Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Ines Montani 2020-07-22 13:42:59 +02:00 committed by GitHub
parent 311d0bde29
commit 43b960c01b
179 changed files with 6946 additions and 4619 deletions


@@ -17,7 +17,6 @@ import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.kb import KnowledgeBase
 from spacy.gold import Example
 from spacy.pipeline import EntityRuler

@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        kb = KnowledgeBase(vocab=nlp.vocab)
-        kb.load_bulk(kb_path)
-        print("Loaded Knowledge Base from '%s'" % kb_path)
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"kb": kb, "incl_prior": False}
+        print("Loading Knowledge Base from '%s'" % kb_path)
+        cfg = {
+            "kb": {
+                "@assets": "spacy.KBFromFile.v1",
+                "vocab_path": vocab_path,
+                "kb_path": kb_path,
+            },
+            # use only the predicted EL score and not the prior probability (for demo purposes)
+            "incl_prior": False,
+        }
         entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)
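The updated example no longer passes an in-memory KnowledgeBase in the component config; instead, the config references a registered function by name and spaCy builds the KB when the pipe is created. As a rough sketch of that pattern (the registry table name is inferred from the "@assets" key and the loader name/body below are illustrative, not the actual spaCy implementation):

# Hypothetical KB loader registered under a custom name; spaCy would call it
# when resolving the "kb" block of the entity_linker config.
from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


@registry.assets.register("my.KBFromFile.v1")
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
    vocab = Vocab().from_disk(vocab_path)   # load the vocab the KB was built with
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)                   # same call the old example used directly
    return kb

Referencing a function like this keeps the config JSON-serializable, which is what the new E961 error below enforces.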


@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a18,<8.0.0a20",
+    "thinc>=8.0.0a19,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]


@@ -1,11 +1,11 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a18,<8.0.0a20
+thinc>=8.0.0a19,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.7.0,<1.1.0
+wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0


@@ -34,15 +34,15 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
     blis>=0.4.0,<0.5.0
-    wasabi>=0.7.0,<1.1.0
+    wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0


@@ -32,8 +32,14 @@ MOD_NAMES = [
     "spacy.attrs",
     "spacy.kb",
     "spacy.morphology",
-    "spacy.pipeline.pipes",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.multitask",
+    "spacy.pipeline.ner",
+    "spacy.pipeline.pipe",
+    "spacy.pipeline.sentencizer",
+    "spacy.pipeline.senter",
+    "spacy.pipeline.tagger",
     "spacy.syntax.stateclass",
     "spacy.syntax._state",
     "spacy.tokenizer",


@@ -14,7 +14,6 @@ from .about import __version__
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
-from .language import component

 if sys.maxunicode == 65535:


@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid config override '{opt}'"
+        err = f"Invalid CLI argument '{opt}'"
         if opt.startswith("--"):  # new argument
             opt = opt.replace("--", "").replace("-", "_")
             if "." not in opt:

@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             else:
                 value = args.pop(0)
             # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd b unintuitive to
+            # values. But since they come from the CLI, it'd be unintuitive to
             # explicitly mark strings with escaped quotes. So we're working
             # around that here by falling back to a string if parsing fails.
             # TODO: improve logic to handle simple types like list of strings?

@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             except ValueError:
                 result[opt] = str(value)
         else:
-            msg.fail(f"{err}: options need to start with --", exits=1)
+            msg.fail(f"{err}: override option should start with --", exits=1)
     return result
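For context, a standalone sketch of the override parsing shown above (not the spaCy implementation itself): `--section.key value` pairs become a flat dict of dot-notation keys, with values parsed as JSON and falling back to plain strings.

import json
from typing import Any, Dict, List


def parse_overrides(args: List[str]) -> Dict[str, Any]:
    result = {}
    while args:
        # "--training.batch_size" -> "training.batch_size"
        opt = args.pop(0).replace("--", "")
        # flags without a value are treated as boolean true
        value = "true" if not args or args[0].startswith("--") else args.pop(0)
        try:
            result[opt] = json.loads(value)
        except ValueError:
            result[opt] = str(value)
    return result


# parse_overrides(["--training.batch_size", "128", "--nlp.lang", "en"])
# -> {"training.batch_size": 128, "nlp.lang": "en"}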


@@ -3,12 +3,12 @@ from pathlib import Path
 from collections import Counter
 import sys
 import srsly
-from wasabi import Printer, MESSAGES, msg
+from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
+from thinc.api import Config

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
-from ..schemas import ConfigSchema
 from ..gold import Corpus, Example
 from ..syntax import nonproj
 from ..language import Language

@@ -33,6 +33,9 @@ def debug_config_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
+    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
+    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
     # fmt: on
 ):
     """Debug a config.cfg file and show validation errors. The command will

@@ -40,14 +43,37 @@ def debug_config_cli(
     validation errors are blocking and will prevent the rest of the config from
     being resolved. This means that you may not see all validation errors at
     once and some issues are only shown once previous errors have been fixed.
+    Similar as with the 'train' command, you can override settings from the config
+    as command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]".
     """
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error():
-        util.load_config(
-            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
-        )
-    msg.good("Config is valid")
+        config = Config().from_disk(config_path)
+        try:
+            nlp, _ = util.load_model_from_config(
+                config, overrides=overrides, auto_fill=auto_fill
+            )
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    is_stdout = output_path is not None and str(output_path) == "-"
+    if auto_fill:
+        orig_config = config.to_str()
+        filled_config = nlp.config.to_str()
+        if orig_config == filled_config:
+            msg.good("Original config is valid, no values were auto-filled")
+        else:
+            msg.good("Auto-filled config is valid")
+            if diff:
+                print(diff_strings(config.to_str(), nlp.config.to_str()))
+    else:
+        msg.good("Original config is valid", show=not is_stdout)
+    if is_stdout:
+        print(nlp.config.to_str())
+    elif output_path is not None:
+        nlp.config.to_disk(output_path)
+        msg.good(f"Saved updated config to {output_path}")

 @debug_cli.command(
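As an aside, the same auto-fill-and-diff flow can be sketched outside the CLI with the helpers imported above; the keyword arguments mirror the command, and the config path is just a placeholder.

from thinc.api import Config
from wasabi import diff_strings
from spacy import util

config = Config().from_disk("config.cfg")        # placeholder path
nlp, _ = util.load_model_from_config(config, auto_fill=True)
filled = nlp.config                               # config with defaults filled in
if config.to_str() == filled.to_str():
    print("Original config is complete")
else:
    # show exactly what the built-in defaults filled in
    print(diff_strings(config.to_str(), filled.to_str()))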
@@ -117,16 +143,13 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    nlp = util.load_model_from_config(config["nlp"])
+        cfg = Config().from_disk(config_path)
+        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+        # TODO: handle base model
     lang = config["nlp"]["lang"]
-    base_model = config["nlp"]["base_model"]
-    pipeline = list(config["nlp"]["pipeline"].keys())
+    base_model = config["training"]["base_model"]
+    pipeline = nlp.pipe_names
+    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
     tag_map_path = util.ensure_path(config["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:

@@ -164,19 +187,17 @@ def debug_data(
     msg.good("Corpus is loadable")

     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset, pipeline, nlp, make_proj=False
+        train_dataset, factory_names, nlp, make_proj=False
     )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
+    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]

     msg.divider("Training stats")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
-    for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
         msg.text(f"Starting with base model '{base_model}'")
     else:

@@ -244,7 +265,7 @@ def debug_data(
         else:
             msg.info("No word vectors present in the model")

-    if "ner" in pipeline:
+    if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
             label for label in gold_train_data["ner"] if label not in ("O", "-", None)

@@ -332,7 +353,7 @@ def debug_data(
                 "with punctuation can not be trained with a noise level > 0."
             )

-    if "textcat" in pipeline:
+    if "textcat" in factory_names:
         msg.divider("Text Classification")
         labels = [label for label in gold_train_data["cats"]]
         model_labels = _get_labels_from_model(nlp, "textcat")

@@ -379,7 +400,7 @@ def debug_data(
                 "contains only instances with mutually-exclusive classes."
             )

-    if "tagger" in pipeline:
+    if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map

@@ -394,7 +415,7 @@ def debug_data(
         for label in non_tagmap:
             msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

-    if "parser" in pipeline:
+    if "parser" in factory_names:
         has_low_data_warning = False
         msg.divider("Dependency Parsing")

@@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
+    examples: Sequence[Example],
+    factory_names: List[str],
+    nlp: Language,
+    make_proj: bool,
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),

@@ -573,7 +597,7 @@ def _compile_gold(
         for word in valid_words:
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
-        if "ner" in pipeline:
+        if "ner" in factory_names:
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue

@@ -595,14 +619,14 @@ def _compile_gold(
                     data["ner"][combined_label] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "textcat" in pipeline:
+        if "textcat" in factory_names:
             data["cats"].update(gold.cats)
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
-        if "tagger" in pipeline:
+        if "tagger" in factory_names:
             tags = eg.get_aligned("TAG", as_string=True)
             data["tags"].update([x for x in tags if x is not None])
-        if "parser" in pipeline:
+        if "parser" in factory_names:
            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
            data["deps"].update([x for x in aligned_deps if x is not None])
            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
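The switch from pipe names to factory names matters because, after this refactor, a component's instance name no longer has to match its factory, so checks like `"ner" in pipeline` have to run against factory names instead. A hypothetical illustration (the component names are made up, and the `name` keyword is assumed from the new add_pipe API):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer", name="sent_a")
nlp.add_pipe("sentencizer", name="sent_b")
print(nlp.pipe_names)  # ['sent_a', 'sent_b']
factory_names = [nlp.get_pipe_meta(name).factory for name in nlp.pipe_names]
print(factory_names)   # ['sentencizer', 'sentencizer']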


@@ -1,8 +1,11 @@
+from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
+from thinc.api import Model
+import typer

-from ._util import Arg, Opt, debug_cli
+from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
 from ..lang.en import English

@@ -10,8 +13,10 @@ from ..lang.en import English
 @debug_cli.command("model")
 def debug_model_cli(
     # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
-    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
+    section: str = Arg(..., help="Section that defines the model to be analysed"),
+    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
     dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
     parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
     gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),

@@ -20,14 +25,18 @@ def debug_model_cli(
     P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
     P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
     P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
-    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
-    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     """
     Analyze a Thinc model implementation. Includes checks for internal structure
     and activations during training.
     """
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,

@@ -39,27 +48,47 @@ def debug_model_cli(
         "print_after_training": P2,
         "print_prediction": P3,
     }
+    config_overrides = parse_config_overrides(ctx.args)
+    cfg = Config().from_disk(config_path)
+    with show_validation_error():
+        try:
+            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    seed = config["pretraining"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
+
+    component = config
+    parts = section.split(".")
+    for item in parts:
+        try:
+            component = component[item]
+        except KeyError:
+            msg.fail(
+                f"The section '{section}' is not a valid section in the provided config.",
+                exits=1,
+            )
+    if hasattr(component, "model"):
+        model = component.model
     else:
-        msg.info(f"Using CPU")
-    debug_model(
-        config_path, print_settings=print_settings,
-    )
+        msg.fail(
+            f"The section '{section}' does not specify an object that holds a Model.",
+            exits=1,
+        )
+    debug_model(model, print_settings=print_settings)


-def debug_model(config_path: Path, *, print_settings=None):
+def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+    if not isinstance(model, Model):
+        msg.fail(
+            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
+            exits=1,
+        )
     if print_settings is None:
         print_settings = {}
-    model = util.load_config(config_path, create_objects=True)["model"]

     # STEP 0: Printing before training
     msg.info(f"Analysing model with ID {model.id}")
     if print_settings.get("print_before_training"):

@@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
         _print_model(model, print_settings)

     # STEP 1: Initializing the model and printing again
-    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
+    Y = _get_output(model.ops.xp)
+    _set_output_dim(nO=Y.shape[-1], model=model)
+    model.initialize(X=_get_docs(), Y=Y)
     if print_settings.get("print_after_init"):
         msg.info(f"After initialization:")
         _print_model(model, print_settings)

@@ -110,12 +141,16 @@ def _get_docs():

 def _get_output(xp):
-    return xp.asarray(
-        [
-            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
-            for i, _ in enumerate(_get_docs())
-        ]
-    )
+    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+
+
+def _set_output_dim(model, nO):
+    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)


 def _print_model(model, print_settings):
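A minimal sketch of how the dotted `section` argument above is resolved: walk the resolved config key by key and expect the target block to expose a Thinc model. The helper name and example section are illustrative only.

def resolve_section(config, section: str):
    component = config
    for key in section.split("."):
        component = component[key]  # raises KeyError if the path is invalid
    # the CLI expects the resolved object to hold a Thinc Model
    return component.model if hasattr(component, "model") else component


# e.g. resolve_section(config, "components.tok2vec") would return the
# tok2vec component's model, given a resolved training config.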


@@ -105,9 +105,10 @@ def evaluate(
         print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
     if displacy_path:
+        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
         docs = [ex.predicted for ex in dev_dataset]
-        render_deps = "parser" in nlp.meta.get("pipeline", [])
-        render_ents = "ner" in nlp.meta.get("pipeline", [])
+        render_deps = "parser" in factory_names
+        render_ents = "ner" in factory_names
         render_parses(
             docs,
             displacy_path,


@@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         msg.fail("Can't find model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     if model_path.resolve() != model_path:
-        meta["link"] = str(model_path)
         meta["source"] = str(model_path.resolve())
     else:
         meta["source"] = str(model_path)


@@ -125,7 +125,6 @@ def get_meta(
     meta.update(existing_meta)
     nlp = util.load_model_from_path(Path(model_path))
     meta["spacy_version"] = util.get_model_version_range(about.__version__)
-    meta["pipeline"] = nlp.pipe_names
     meta["vectors"] = {
         "width": nlp.vocab.vectors_length,
         "vectors": len(nlp.vocab.vectors),


@@ -5,7 +5,7 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 from thinc.api import CosineDistance, L2Distance
 from wasabi import msg

@@ -15,7 +15,6 @@ import typer
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ..schemas import ConfigSchema
 from ..errors import Errors
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model

@@ -37,6 +36,7 @@ def pretrain_cli(
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
     # fmt: on
 ):
     """

@@ -67,6 +67,7 @@ def pretrain_cli(
         config_overrides=overrides,
         resume_path=resume_path,
         epoch_resume=epoch_resume,
+        use_gpu=use_gpu,
     )

@@ -77,40 +78,29 @@ def pretrain(
     config_overrides: Dict[str, Any] = {},
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
+    use_gpu: int = -1,
 ):
     verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
-    msg.info(f"Loading config from: {config_path}")
-    with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            validate=True,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    if not output_dir.exists():
-        output_dir.mkdir()
-        msg.good(f"Created output directory: {output_dir}")
-    use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
         msg.info("Using GPU")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    msg.info(f"Loading config from: {config_path}")
+    config = Config().from_disk(config_path)
+    with show_validation_error():
+        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+    # TODO: validate that [pretraining] block exists
+    if not output_dir.exists():
+        output_dir.mkdir()
+        msg.good(f"Created output directory: {output_dir}")
     seed = config["pretraining"]["seed"]
     if seed is not None:
         fix_random_seed(seed)
     if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
         use_pytorch_for_gpu_memory()
-    nlp_config = config["nlp"]
-    srsly.write_json(output_dir / "config.json", config)
+    config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")
-    config = util.load_config(config_path, create_objects=True)
-    nlp = util.load_model_from_config(nlp_config)
     pretrain_config = config["pretraining"]

     if texts_loc != "-":  # reading from a file


@@ -25,7 +25,7 @@ def profile_cli(
     # fmt: on
 ):
     """
-    Profile a spaCy pipeline, to find out which functions take the most time.
+    Profile which functions take the most time in a spaCy pipeline.
     Input should be formatted as one JSON object per line with a key "text".
     It can either be provided as a JSONL file, or be read from sys.sytdin.
     If no input file is specified, the IMDB dataset is loaded via Thinc.


@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 from timeit import default_timer as timer
 import srsly
 import tqdm

@@ -7,6 +7,7 @@ from wasabi import msg
 import thinc
 import thinc.schedules
 from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
+from thinc.api import Config, Optimizer
 import random
 import typer

@@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
 from ..gold import Corpus, Example
 from ..lookups import Lookups
+from ..language import Language
 from .. import util
 from ..errors import Errors
-from ..schemas import ConfigSchema

 # Don't remove - required to load the built-in architectures
 from ..ml import models  # noqa: F401

-registry = util.registry

 @app.command(
     "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )

@@ -38,6 +36,8 @@ def train_cli(
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
+    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
     # fmt: on
 ):
     """

@@ -53,9 +53,7 @@ def train_cli(
     referenced in the config.
     """
     util.set_env_log(verbose)
-    verify_cli_args(
-        train_path=train_path, dev_path=dev_path, config_path=config_path,
-    )
+    verify_cli_args(train_path, dev_path, config_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     train(

@@ -63,6 +61,8 @@ def train_cli(
         {"train": train_path, "dev": dev_path},
         output_path=output_path,
         config_overrides=overrides,
+        use_gpu=use_gpu,
+        resume_training=resume,
     )

@@ -72,63 +72,53 @@ def train(
     raw_text: Optional[Path] = None,
     output_path: Optional[Path] = None,
     config_overrides: Dict[str, Any] = {},
+    use_gpu: int = -1,
+    resume_training: bool = False,
 ) -> None:
-    msg.info(f"Loading config from: {config_path}")
-    # Read the config first without creating objects, to get to the original nlp_config
-    with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    msg.info(f"Loading config and nlp from: {config_path}")
+    config = Config().from_disk(config_path)
+    with show_validation_error():
+        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+    if config["training"]["base_model"]:
+        base_nlp = util.load_model(config["training"]["base_model"])
+        # TODO: do something to check base_nlp against regular nlp described in config?
+        nlp = base_nlp
+    verify_config(nlp)
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
-    if config["training"].get("use_pytorch_for_gpu_memory"):
+    if config["training"]["use_pytorch_for_gpu_memory"]:
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
-    nlp_config = config["nlp"]
-    config = util.load_config(
-        config_path,
-        create_objects=True,
-        schema=ConfigSchema,
-        overrides=config_overrides,
-    )
     training = config["training"]
-    msg.info("Creating nlp from config")
-    nlp = util.load_model_from_config(nlp_config)
     optimizer = training["optimizer"]
     limit = training["limit"]
     corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
-    if "textcat" in nlp_config["pipeline"]:
-        verify_textcat_config(nlp, nlp_config)
-    if training.get("resume", False):
+    if resume_training:
         msg.info("Resuming training")
         nlp.resume_training()
     else:
         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
-        train_examples = list(
-            corpus.train_dataset(
-                nlp,
-                shuffle=False,
-                gold_preproc=training["gold_preproc"],
-                max_length=training["max_length"],
-            )
+        train_examples = corpus.train_dataset(
+            nlp,
+            shuffle=False,
+            gold_preproc=training["gold_preproc"],
+            max_length=training["max_length"],
         )
+        train_examples = list(train_examples)
         nlp.begin_training(lambda: train_examples)

-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
+    if tag_map:
+        # Replace tag map with provided mapping
+        nlp.vocab.morphology.load_tag_map(tag_map)
+    if morph_rules:
+        # Load morph rules
+        nlp.vocab.morphology.load_morph_exceptions(morph_rules)

     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed

@@ -151,9 +141,8 @@ def train(
         for subpath in tok2vec_path.split("."):
             tok2vec = tok2vec.get(subpath)
         if not tok2vec:
-            msg.fail(
-                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
-            )
+            err = f"Could not locate the tok2vec model at {tok2vec_path}"
+            msg.fail(err, exits=1)
         tok2vec.from_bytes(weights_data)

     msg.info("Loading training corpus")

@@ -169,12 +158,11 @@ def train(
         evaluate,
         dropout=training["dropout"],
         accumulate_gradient=training["accumulate_gradient"],
-        patience=training.get("patience", 0),
-        max_steps=training.get("max_steps", 0),
+        patience=training["patience"],
+        max_steps=training["max_steps"],
         eval_frequency=training["eval_frequency"],
         raw_text=raw_text,
     )
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
     print_row = setup_printer(training, nlp)

@@ -209,8 +197,10 @@ def train(
             msg.good(f"Saved model to output directory {final_model_path}")


-def create_train_batches(nlp, corpus, cfg):
-    max_epochs = cfg.get("max_epochs", 0)
+def create_train_batches(
+    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
+):
+    max_epochs = cfg["max_epochs"]
     train_examples = list(
         corpus.train_dataset(
             nlp,

@@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
             max_length=cfg["max_length"],
         )
     )
     epoch = 0
-    batch_strategy = cfg.get("batch_by", "sequences")
+    batch_strategy = cfg["batch_by"]
     while True:
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)

@@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
             )
         else:
             batches = util.minibatch(train_examples, size=cfg["batch_size"])
-        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
         try:
             first = next(batches)

@@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
         random.shuffle(train_examples)


-def create_evaluation_callback(nlp, optimizer, corpus, cfg):
-    def evaluate():
-        dev_examples = list(
-            corpus.dev_dataset(
-                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
-            )
+def create_evaluation_callback(
+    nlp: Language,
+    optimizer: Optimizer,
+    corpus: Corpus,
+    cfg: Union[Config, Dict[str, Any]],
+) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    def evaluate() -> Tuple[float, Dict[str, float]]:
+        dev_examples = corpus.dev_dataset(
+            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
         )
+        dev_examples = list(dev_examples)
         n_words = sum(len(ex.predicted) for ex in dev_examples)
-        batch_size = cfg.get("evaluation_batch_size", 128)
+        batch_size = cfg["eval_batch_size"]
         start_time = timer()
         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
                 scorer = nlp.evaluate(dev_examples, batch_size=batch_size)

@@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         try:
             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
         except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="score_weights", key=str(e), keys=list(scores.keys())
-                )
-            )
+            keys = list(scores.keys())
+            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
+            raise KeyError(err)
         scores["speed"] = wps
         return weighted_score, scores

@@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):

 def train_while_improving(
-    nlp,
-    optimizer,
+    nlp: Language,
+    optimizer: Optimizer,
     train_data,
     evaluate,
     *,
-    dropout,
-    eval_frequency,
-    accumulate_gradient=1,
-    patience=0,
-    max_steps=0,
-    raw_text=None,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    patience: int,
+    max_steps: int,
+    raw_text: List[Dict[str, str]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,

@@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
         yield subbatch


-def setup_printer(training, nlp):
+def setup_printer(
+    training: Union[Dict[str, Any], Config], nlp: Language
+) -> Callable[[Dict[str, Any]], None]:
     score_cols = training["scores"]
     score_widths = [max(len(col), 6) for col in score_cols]
     loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]

@@ -423,11 +412,10 @@ def setup_printer(training, nlp):
     table_header = [col.upper() for col in table_header]
     table_widths = [3, 6] + loss_widths + score_widths + [6]
     table_aligns = ["r" for _ in table_widths]
     msg.row(table_header, widths=table_widths)
     msg.row(["-" * width for width in table_widths])

-    def print_row(info):
+    def print_row(info: Dict[str, Any]) -> None:
         try:
             losses = [
                 "{0:.2f}".format(float(info["losses"][pipe_name]))

@@ -463,7 +451,9 @@ def setup_printer(training, nlp):
     return print_row


-def update_meta(training, nlp, info):
+def update_meta(
+    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+) -> None:
     score_cols = training["scores"]
     nlp.meta["performance"] = {}
     for metric in score_cols:

@@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


-def load_from_paths(config):
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
     # TODO: separate checks from loading
     raw_text = util.ensure_path(config["training"]["raw_text"])
     if raw_text is not None:

@@ -506,7 +498,7 @@ def verify_cli_args(
     dev_path: Path,
     config_path: Path,
     output_path: Optional[Path] = None,
-):
+) -> None:
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)

@@ -528,12 +520,23 @@ def verify_cli_args(
     )


-def verify_textcat_config(nlp, nlp_config):
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config and loaded nlp object."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["@factories"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)


+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
-    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
+    if pipe_config.get("positive_label"):
         textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
-        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
+        pos_label = pipe_config.get("positive_label")
         if pos_label not in textcat_labels:
             msg.fail(
                 f"The textcat's 'positive_label' config setting '{pos_label}' "

spacy/default_config.cfg (new file, 102 lines)

@@ -0,0 +1,102 @@
[nlp]
lang = null
stop_words = []
lex_attr_getters = {}
pipeline = []
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true
[components]
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 5000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
eval_batch_size = 128
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "sequences"
raw_text = null
tag_map = null
morph_rules = null
base_model = null
vectors = null
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.001
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"
[pretraining.objective]
type = "characters"
n_characters = 4
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
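
This default config is what the new auto-fill logic merges user configs into. A rough sketch of how the pieces fit together (component and factory names are illustrative): the [components] blocks only define how each component is built, while the pipeline list under [nlp] controls which components are used and in which order, so reordering blocks in the file does not change the pipeline order.

from thinc.api import Config

user_cfg = Config().from_str("""
[nlp]
lang = "en"
pipeline = ["tagger", "parser"]

[components]

[components.parser]
@factories = "parser"

[components.tagger]
@factories = "tagger"
""")
# block order in the file is irrelevant; only the pipeline list decides order
assert user_cfg["nlp"]["pipeline"] == ["tagger", "parser"]

From there, something like util.load_model_from_config(user_cfg, auto_fill=True), as used in the CLI diffs above, would merge in the remaining defaults from this file and from each component's own default config.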


@@ -124,20 +124,24 @@ class Warnings:
 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
-    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
-            "calls `nlp.create_pipe` with a component name that's not built "
-            "in - for example, when constructing the pipeline from a model's "
-            "meta.json. If you're using a custom component, you can write to "
-            "`Language.factories['{name}']` or remove it from the model meta "
-            "and add it via `nlp.add_pipe` instead.")
+    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
+            "This usually happens when spaCy calls nlp.{method} with custom "
+            "component name that's not registered on the current language class. "
+            "If you're using a custom component, make sure you've added the "
+            "decorator @Language.component (for function components) or "
+            "@Language.factory (for class components).\n\nAvailable "
+            "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
-            "got {component} (name: '{name}').")
-    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
-            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
+            "got {component} (name: '{name}'). If you're using a custom "
+            "component factory, double-check that it correctly returns your "
+            "initialized component.")
+    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
     E005 = ("Pipeline component '{name}' returned None. If you're using a "
             "custom component, maybe you forgot to return the processed Doc?")
-    E006 = ("Invalid constraints. You can only set one of the following: "
-            "before, after, first, last.")
+    E006 = ("Invalid constraints for adding pipeline component. You can only "
+            "set one of the following: before (component name or index), "
+            "after (component name or index), first (True) or last (True). "
+            "Invalid configuration: {args}. Existing components: {opts}")
     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
     E008 = ("Some current components would be lost when restoring previous "
             "pipeline state. If you added components after calling "

@@ -184,7 +188,7 @@ class Errors:
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
+            "nlp.add_pipe('sentencizer'). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")

@@ -365,8 +369,6 @@ class Errors:
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
-            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
     E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
             "to provide a valid JSON object as input with either the `text` "
             "or `tokens` key. For more info, see the docs:\n"

@@ -484,6 +486,62 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
"spaCy v3. Instead, you can use the @Language.factory decorator "
"to register your custom component factory or @Language.component "
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code})")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.")
E961 = ("Found non-serializable Python object in config. Configs should "
"only include values that can be serialized to JSON. If you need "
"to pass models or other objects to your component, use a reference "
"to a registered function or initialize the object in your "
"component.\n\n{config}")
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
"got: {cfg_type}.")
E963 = ("Can't read component info from @Language.{decorator} decorator. "
"Maybe you forgot to call it? Make sure you're using "
"@Language.{decorator}() instead of @Language.{decorator}.")
E964 = ("The pipeline component factory for '{name}' needs to have the "
"following named arguments, which are passed in by spaCy:\n- nlp: "
"receives the current nlp object and lets you access the vocab\n- "
"name: the name of the component instance, can be used to identify "
"the component, output losses etc.")
E965 = ("It looks like you're using the @Language.component decorator to "
"register '{name}' on a class instead of a function component. If "
"you need to register a class or function that *returns* a component "
"function, use the @Language.factory decorator instead.")
E966 = ("nlp.add_pipe now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component} (name: '{name}').\n\n- If you created your component "
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
"nlp.add_pipe('name') instead.\n\n- If you passed in a component "
"like TextCategorizer(): call nlp.add_pipe with the string name "
"instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
"component: Add the decorator @Language.component (for function "
"components) or @Language.factory (for class components / factories) "
"to your custom component and assign it a name, e.g. "
"@Language.component('your_name'). You can then run "
"nlp.add_pipe('your_name') to add it to the pipeline.")
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
E968 = ("nlp.replace_pipe now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component}.\n\n- If you created your component "
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
"nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
"component like TextCategorizer(): call nlp.replace_pipe with the "
"string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
"- If you're using a custom component: Add the decorator "
"@Language.component (for function components) or @Language.factory "
"(for class components / factories) to your custom component and "
"assign it a name, e.g. @Language.component('your_name'). You can "
"then run nlp.replace_pipe('{name}', 'your_name').")
E969 = ("Expected string values for field '{field}', but received {types} instead. ") E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
@ -506,10 +564,12 @@ class Errors:
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: " E983 = ("Invalid key for '{dict}': {key}. Available keys: "
"{keys}") "{keys}")
E985 = ("The pipeline component '{component}' is already available in the base " E984 = ("Invalid component config for '{name}': no @factories key "
"model. The settings in the component block in the config file are " "specifying the registered function used to initialize the "
"being ignored. If you want to replace this component instead, set " "component. For example, @factories = \"ner\" will use the 'ner' "
"'replace' to True in the training configuration.") "factory and all other settings in the block will be passed "
"to it as arguments.\n\n{config}")
E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. " E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?") "Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or " E987 = ("The text of an example training instance is either a Doc or "
@ -530,9 +590,9 @@ class Errors:
E992 = ("The function `select_pipes` was called with `enable`={enable} " E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting " "and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.") "for the `nlp` pipeline with components {names}.")
E993 = ("The config for 'nlp' should include either a key 'name' to " E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
"refer to an existing model by name or path, or a key 'lang' " "the code of the language to initialize it with (for example "
"to create a new blank model.") "'en' for English).\n\n{config}")
E996 = ("Could not parse {file}: {msg}") E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. " E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes " "This would map '{chunk}' to '{orth}' given token attributes "
@ -540,9 +600,9 @@ class Errors:
E999 = ("Unable to merge the `Doc` objects because they do not all share " E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.") "the same `Vocab`.")
E1000 = ("No pkuseg model available. Provide a pkuseg model when " E1000 = ("No pkuseg model available. Provide a pkuseg model when "
"initializing the pipeline: " "initializing the pipeline:\n"
'cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; ' 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`') 'nlp = Chinese(config=cfg)')
@add_codes @add_codes
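
The new messages above (E957, E963-E966, E968) all point at the same v3 pattern: components are registered with a decorator and then added to the pipeline by their string name. A minimal sketch of that usage, assuming the API behaves as the messages describe (the component name here is made up for illustration):

import spacy
from spacy.language import Language

# "print_length" is a hypothetical name used only for this example
@Language.component("print_length")
def print_length(doc):
    # a stateless function component: takes a Doc and returns it (cf. E957/E966)
    print("tokens:", len(doc))
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("print_length")  # pass the registered string name, not the callable
doc = nlp("This is a test.")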


@ -1,10 +1,9 @@
import re import re
from .conll_ner2docs import n_sents_info from .conll_ner2docs import n_sents_info
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language
from ...tokens import Doc, Token, Span from ...tokens import Doc, Token, Span
from ...vocab import Vocab
from wasabi import Printer from wasabi import Printer
@ -73,7 +72,7 @@ def read_conllx(
ner_map=None, ner_map=None,
): ):
""" Yield docs, one for each sentence """ """ Yield docs, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc vocab = Vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:


@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class AfrikaansDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "af" lang = "af"
stop_words = STOP_WORDS stop_words = {"@language_data": "spacy.af.stop_words"}
"""
@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Afrikaans(Language): class Afrikaans(Language):
lang = "af" lang = "af"
Defaults = AfrikaansDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Afrikaans"] __all__ = ["Afrikaans"]
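
As a quick illustration of the pattern this file introduces (and which the remaining languages repeat), the stop words referenced from the config string can be resolved through the registry at runtime. A sketch, assuming this branch's registry.language_data table:

from spacy.lang.af import Afrikaans  # importing the module runs the @registry.language_data decorator above
from spacy.util import registry

stop_words = registry.language_data.get("spacy.af.stop_words")()
print(len(stop_words))  # size of the registered Afrikaans stop-word set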


@ -1,31 +1,48 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArabicDefaults(Language.Defaults): class ArabicDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ar"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Arabic(Language): class Arabic(Language):
lang = "ar" lang = "ar"
Defaults = ArabicDefaults Defaults = ArabicDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Arabic"] __all__ = ["Arabic"]
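
The writing-system flags that previously lived on the Defaults class are now plain config values. A sketch of reading them back with Thinc's Config, assuming booleans are parsed as shown:

from thinc.api import Config
from spacy.lang.ar import DEFAULT_CONFIG

cfg = Config().from_str(DEFAULT_CONFIG)
# expected: {"direction": "rtl", "has_case": False, "has_letters": True}
print(cfg["nlp"]["writing_system"])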


@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class BulgarianDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "bg" lang = "bg"
stop_words = STOP_WORDS stop_words = {"@language_data": "spacy.bg.stop_words"}
"""
@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Bulgarian(Language): class Bulgarian(Language):
lang = "bg" lang = "bg"
Defaults = BulgarianDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bulgarian"] __all__ = ["Bulgarian"]


@ -1,18 +1,35 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BengaliDefaults(Language.Defaults): class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "bn"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
class Bengali(Language): class Bengali(Language):
lang = "bn" lang = "bn"
Defaults = BengaliDefaults Defaults = BengaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bengali"] __all__ = ["Bengali"]
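
This is the first config in the diff that wires up a lemmatizer block with lookup-data paths. A sketch of inspecting the parsed block (the ${nlp:lang} value is meant to resolve to "bn" once the config is interpolated):

from thinc.api import Config
from spacy.lang.bn import DEFAULT_CONFIG

cfg = Config().from_str(DEFAULT_CONFIG)
print(cfg["nlp"]["lang"])                        # "bn"
print(cfg["nlp"]["lemmatizer"]["@lemmatizers"])  # "spacy.Lemmatizer.v1"
# cfg["nlp"]["lemmatizer"]["data_paths"] points at spacy-lookups-data for the language above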


@ -1,31 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class CatalanDefaults(Language.Defaults): class CatalanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ca"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
class Catalan(Language): class Catalan(Language):
lang = "ca" lang = "ca"
Defaults = CatalanDefaults Defaults = CatalanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Catalan"] __all__ = ["Catalan"]


@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class CzechDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "cs" lang = "cs"
stop_words = STOP_WORDS stop_words = {"@language_data": "spacy.cs.stop_words"}
"""
@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Czech(Language): class Czech(Language):
lang = "cs" lang = "cs"
Defaults = CzechDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Czech"] __all__ = ["Czech"]


@ -1,27 +1,50 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "da"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
class Danish(Language): class Danish(Language):
lang = "da" lang = "da"
Defaults = DanishDefaults Defaults = DanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Danish"] __all__ = ["Danish"]


@ -1,23 +1,40 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "de"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
single_orth_variants = [ single_orth_variants = [
{"tags": ["$("], "variants": ["", "..."]}, {"tags": ["$("], "variants": ["", "..."]},
@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
class German(Language): class German(Language):
lang = "de" lang = "de"
Defaults = GermanDefaults Defaults = GermanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["German"] __all__ = ["German"]


@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...lookups import Lookups from ...util import update_exc, registry
from ...attrs import LANG
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
return GreekLemmatizer(data_paths=data_paths)
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "el"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return GreekLemmatizer(lookups)
class Greek(Language): class Greek(Language):
lang = "el" lang = "el"
Defaults = GreekDefaults Defaults = GreekDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Greek"] __all__ = ["Greek"]


@ -1,3 +1,5 @@
from typing import Dict, List
from ...lemmatizer import Lemmatizer from ...lemmatizer import Lemmatizer
@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
not applicable for Greek language. not applicable for Greek language.
""" """
def lemmatize(self, string, index, exceptions, rules): def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
string = string.lower() string = string.lower()
forms = [] forms = []
if string in index: if string in index:


@ -1,25 +1,50 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...lemmatizer import Lemmatizer
from ...util import update_exc from ...util import update_exc, registry
def _return_en(_): DEFAULT_CONFIG = """
return "en" [nlp]
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.en.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.en.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = _return_en
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
single_orth_variants = [ single_orth_variants = [
@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]}, {"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
] ]
@classmethod
def is_base_form(cls, univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
class English(Language): class English(Language):
lang = "en" lang = "en"
Defaults = EnglishDefaults Defaults = EnglishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["English"] __all__ = ["English"]
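
The English lemmatizer is now provided through a registered factory rather than a create_lemmatizer classmethod. A sketch of resolving it by hand, assuming the registry entry defined above and the Lemmatizer signature used there:

from spacy.lang.en import English  # importing registers "spacy.EnglishLemmatizer.v1"
from spacy.util import registry

make_lemmatizer = registry.lemmatizers.get("spacy.EnglishLemmatizer.v1")
lemmatizer = make_lemmatizer(data_paths={})  # empty lookup tables unless spacy-lookups-data is installed
print(type(lemmatizer).__name__)  # Lemmatizer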


@ -0,0 +1,36 @@
from typing import Optional
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
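
A quick usage sketch of the helper above (the module path is assumed to match the `from .lemmatizer import is_base_form` import in the English __init__):

from spacy.lang.en.lemmatizer import is_base_form

print(is_base_form("noun", {"Number": "sing"}))   # True: uninflected singular noun
print(is_base_form("verb", {"VerbForm": "ger"}))  # False: gerunds still go through the lemmatizer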


@ -1,47 +1,17 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# fmt: off
_num_words = [ _num_words = [
"zero", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"one", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"two", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
"three", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
"four", "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
"five",
"six",
"seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
"twenty",
"thirty",
"forty",
"fifty",
"sixty",
"seventy",
"eighty",
"ninety",
"hundred",
"thousand",
"million",
"billion",
"trillion",
"quadrillion",
"gajillion",
"bazillion",
] ]
# fmt: on
def like_num(text): def like_num(text: str) -> bool:
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
text = text.replace(",", "").replace(".", "") text = text.replace(",", "").replace(".", "")


@ -1,33 +1,52 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SpanishDefaults(Language.Defaults): class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "es"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
class Spanish(Language): class Spanish(Language):
lang = "es" lang = "es"
Defaults = SpanishDefaults Defaults = SpanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Spanish"] __all__ = ["Spanish"]


@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class EstonianDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "et" lang = "et"
stop_words = STOP_WORDS stop_words = {"@language_data": "spacy.et.stop_words"}
"""
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Estonian(Language): class Estonian(Language):
lang = "et" lang = "et"
Defaults = EstonianDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Estonian"] __all__ = ["Estonian"]


@ -1,25 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "eu"
stop_words = {"@language_data": "spacy.eu.stop_words"}
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
"""
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class BasqueDefaults(Language.Defaults): class BasqueDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "eu"
tokenizer_exceptions = BASE_EXCEPTIONS tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
class Basque(Language): class Basque(Language):
lang = "eu" lang = "eu"
Defaults = BasqueDefaults Defaults = BasqueDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Basque"] __all__ = ["Basque"]


@ -1,7 +1,8 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
from ..norm_exceptions import BASE_NORMS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
DEFAULT_CONFIG = """
[nlp]
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PersianDefaults(Language.Defaults): class PersianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters[LANG] = lambda text: "fa"
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
class Persian(Language): class Persian(Language):
lang = "fa" lang = "fa"
Defaults = PersianDefaults Defaults = PersianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Persian"] __all__ = ["Persian"]


@ -1,31 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "fi"
stop_words = {"@language_data": "spacy.fi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
"""
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FinnishDefaults(Language.Defaults): class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "fi"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Finnish(Language): class Finnish(Language):
lang = "fi" lang = "fi"
Defaults = FinnishDefaults Defaults = FinnishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Finnish"] __all__ = ["Finnish"]


@ -1,44 +1,61 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lookups import Lookups from ...util import update_exc, registry
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FrenchDefaults(Language.Defaults): class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "fr"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return FrenchLemmatizer(lookups)
class French(Language): class French(Language):
lang = "fr" lang = "fr"
Defaults = FrenchDefaults Defaults = FrenchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["French"] __all__ = ["French"]


@ -1,3 +1,5 @@
from typing import Optional, List, Dict
from ...lemmatizer import Lemmatizer from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ from ...symbols import SCONJ, CCONJ
@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
the lookup table. the lookup table.
""" """
def __call__(self, string, univ_pos, morphology=None): def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups: if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)] return [lookup_table.get(string, string)]
@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
) )
return lemmas return lemmas
def is_base_form(self, univ_pos, morphology=None): def lookup(self, string: str, orth: Optional[int] = None) -> str:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif "VerbForm=inf" in morphology:
return True
elif "VerbForm=none" in morphology:
return True
elif "Number=sing" in morphology:
return True
elif "Degree=pos" in morphology:
return True
else:
return False
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
def lookup(self, string, orth=None):
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})
if orth is not None and orth in lookup_table: if orth is not None and orth in lookup_table:
return lookup_table[orth][0] return lookup_table[orth][0]
return string return string
def lemmatize(self, string, index, exceptions, rules): def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower() string = string.lower()
forms = [] forms = []
@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
if not forms: if not forms:
forms.append(string) forms.append(string)
return list(set(forms)) return list(set(forms))
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif "VerbForm=inf" in morphology:
return True
elif "VerbForm=none" in morphology:
return True
elif "Number=sing" in morphology:
return True
elif "Degree=pos" in morphology:
return True
else:
return False


@ -1,23 +1,33 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IrishDefaults(Language.Defaults): class IrishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ga"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Irish(Language): class Irish(Language):
lang = "ga" lang = "ga"
Defaults = IrishDefaults Defaults = IrishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Irish"] __all__ = ["Irish"]


@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
class GujaratiDefaults(Language.Defaults): DEFAULT_CONFIG = """
stop_words = STOP_WORDS [nlp]
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Gujarati(Language): class Gujarati(Language):
lang = "gu" lang = "gu"
Defaults = GujaratiDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Gujarati"] __all__ = ["Gujarati"]


@ -1,22 +1,37 @@
from .stop_words import STOP_WORDS from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HebrewDefaults(Language.Defaults): class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "he"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language): class Hebrew(Language):
lang = "he" lang = "he"
Defaults = HebrewDefaults Defaults = HebrewDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hebrew"] __all__ = ["Hebrew"]


@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class HindiDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters.update(LEX_ATTRS) lang = "hi"
lex_attr_getters[LANG] = lambda text: "hi" stop_words = {"@language_data": "spacy.hi.stop_words"}
stop_words = STOP_WORDS lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Hindi(Language): class Hindi(Language):
lang = "hi" lang = "hi"
Defaults = HindiDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hindi"] __all__ = ["Hindi"]


@ -1,25 +1,39 @@
from .stop_words import STOP_WORDS from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CroatianDefaults(Language.Defaults): class CroatianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "hr"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
class Croatian(Language): class Croatian(Language):
lang = "hr" lang = "hr"
Defaults = CroatianDefaults Defaults = CroatianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Croatian"] __all__ = ["Croatian"]


@ -1,22 +1,35 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG, NORM from ...util import update_exc, registry
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HungarianDefaults(Language.Defaults): class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "hu"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
class Hungarian(Language): class Hungarian(Language):
lang = "hu" lang = "hu"
Defaults = HungarianDefaults Defaults = HungarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hungarian"] __all__ = ["Hungarian"]


@ -1,21 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...attrs import LANG
from ...language import Language from ...language import Language
from ...util import registry
class ArmenianDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "hy" lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS @registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Armenian(Language): class Armenian(Language):
lang = "hy" lang = "hy"
Defaults = ArmenianDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Armenian"] __all__ = ["Armenian"]


@ -1,21 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import update_exc, registry
from ...util import update_exc
DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "id"
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
class Indonesian(Language): class Indonesian(Language):
lang = "id" lang = "id"
Defaults = IndonesianDefaults Defaults = IndonesianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Indonesian"] __all__ = ["Indonesian"]


@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...util import registry
class IcelandicDefaults(Language.Defaults): DEFAULT_CONFIG = """
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) [nlp]
lex_attr_getters[LANG] = lambda text: "is" lang = "is"
stop_words = STOP_WORDS stop_words = {"@language_data": "spacy.is.stop_words"}
"""
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Icelandic(Language): class Icelandic(Language):
lang = "is" lang = "is"
Defaults = IcelandicDefaults default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Icelandic"] __all__ = ["Icelandic"]


@@ -1,20 +1,34 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class ItalianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES

@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language):
    lang = "it"
    Defaults = ItalianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Italian"]


@@ -1,21 +1,187 @@
from typing import Optional, Union, Dict, Any, Set
from pathlib import Path
import srsly
from collections import namedtuple
from thinc.api import Config

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer, registry
from ... import util
DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
split_mode = null
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode)
return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None,
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
sub_tokens_list = (
self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
)
dtokens = [
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
)
for idx, token in enumerate(sudachipy_tokens)
if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t
for idx, t in enumerate(dtokens)
if idx == 0
or not t.surface.isspace()
or t.tag != "空白"
or not dtokens[idx - 1].surface.isspace()
or dtokens[idx - 1].tag != "空白"
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append(
[
self._get_dtokens(sub_a, False),
self._get_dtokens(sub_b, False),
]
)
return sub_tokens_list
def _get_config(self) -> Dict[str, Any]:
return {"split_mode": self.split_mode}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.split_mode = config.get("split_mode", None)
def to_bytes(self, **kwargs) -> bytes:
serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
return util.to_bytes(serializers, [])
def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
util.from_bytes(data, deserializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
return util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
path = util.ensure_path(path)
serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
util.from_disk(path, serializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
class JapaneseDefaults(Language.Defaults):
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]

@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
    return text_dtokens, text_spaces
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.split_mode = config.get("split_mode", None)
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text):
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None,
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
sub_tokens_list = (
self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
)
dtokens = [
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
)
for idx, token in enumerate(sudachipy_tokens)
if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t
for idx, t in enumerate(dtokens)
if idx == 0
or not t.surface.isspace()
or t.tag != "空白"
or not dtokens[idx - 1].surface.isspace()
or dtokens[idx - 1].tag != "空白"
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append(
[
self._get_dtokens(sub_a, False),
self._get_dtokens(sub_b, False),
]
)
return sub_tokens_list
def _get_config(self):
config = OrderedDict((("split_mode", self.split_mode),))
return config
def _set_config(self, config={}):
self.split_mode = config.get("split_mode", None)
def to_bytes(self, **kwargs):
serializers = OrderedDict(
(("cfg", lambda: srsly.json_dumps(self._get_config())),)
)
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
deserializers = OrderedDict(
(("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
)
util.from_bytes(data, deserializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
def to_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(("cfg", lambda p: srsly.write_json(p, self._get_config())),)
)
return util.to_disk(path, serializers, [])
def from_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(("cfg", lambda p: self._set_config(srsly.read_json(p))),)
)
util.from_disk(path, serializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
return JapaneseTokenizer(cls, nlp, config)
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
def make_doc(self, text):
return self.tokenizer(text)
def pickle_japanese(instance):
    return Japanese, tuple()
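A sketch of how the registered tokenizer factory is meant to be wired up by hand, mirroring the [nlp.tokenizer] block with split_mode = "B" instead of null. It assumes SudachiPy and its dictionary are installed, that Japanese() can be constructed directly on this branch, and that registry.tokenizers exposes a catalogue-style .get(); the config resolver would normally perform this step.

from spacy.util import registry
from spacy.lang.ja import Japanese

nlp = Japanese()
# The factory closes over its config arguments first and takes nlp afterwards,
# exactly like create_japanese_tokenizer above.
make_tokenizer = registry.tokenizers.get("spacy.JapaneseTokenizer.v1")
nlp.tokenizer = make_tokenizer(split_mode="B")(nlp)
doc = nlp("日本語の文章です")
print(doc.user_data["sub_tokens"])  # populated by JapaneseTokenizer.__call__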


@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""


@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Kannada(Language):
    lang = "kn"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Kannada"]


@@ -1,51 +1,52 @@
from typing import Set, Optional, Any, Dict
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer, registry


DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}

[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""


@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, nlp: Optional[Language] = None):
        self.vocab = nlp.vocab
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))

@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Dict[str, Any]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):

@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults):
    tag_map = TAG_MAP


class Korean(Language):
    lang = "ko"
    Defaults = KoreanDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


def try_mecab_import() -> None:
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False


def pickle_korean(instance):
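check_spaces recovers the whitespace that MeCab's token stream drops: for each token it yields whether a gap follows it, and the last token always gets False. A worked example with a made-up sentence:

text = "영등포구 에 있다"
tokens = ["영등포구", "에", "있다"]
assert list(check_spaces(text, tokens)) == [True, True, False]
# Doc(vocab, words=tokens, spaces=list(check_spaces(text, tokens)))
# can therefore reproduce the original text exactly.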


@@ -1,26 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LuxembourgishDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES


class Luxembourgish(Language):
    lang = "lb"
    Defaults = LuxembourgishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Luxembourgish"]


@@ -1,3 +1,4 @@
from typing import Set

import unicodedata
import re

@@ -21,21 +22,21 @@ _tlds = set(
)


def is_punct(text: str) -> bool:
    for char in text:
        if not unicodedata.category(char).startswith("P"):
            return False
    return True


def is_ascii(text: str) -> bool:
    for char in text:
        if ord(char) >= 128:
            return False
    return True


def like_num(text: str) -> bool:
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # can be overwritten by lang with list of number words

@@ -49,64 +50,31 @@ def like_num(text):
    return False


def is_bracket(text: str) -> bool:
    brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
    return text in brackets


def is_quote(text: str) -> bool:
    # fmt: off
    quotes = ('"', "'", "`", "«", "»", "", "", "", "", "", "", "", "", "", "", "", "", "''", "``")
    # fmt: on
    return text in quotes


def is_left_punct(text: str) -> bool:
    # fmt: off
    left_punct = ("(", "[", "{", "<", '"', "'", "«", "", "", "", "", "", "", "", "", "``")
    # fmt: on
    return text in left_punct


def is_right_punct(text: str) -> bool:
    right_punct = (")", "]", "}", ">", '"', "'", "»", "", "", "", "", "''")
    return text in right_punct


def is_currency(text: str) -> bool:
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    for char in text:
        if unicodedata.category(char) != "Sc":

@@ -114,11 +82,11 @@ def is_currency(text):
    return True


def like_email(text: str) -> bool:
    return bool(_like_email(text))


def like_url(text: str) -> bool:
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything they say http:// is going to be good.
    if text.startswith("http://") or text.startswith("https://"):

@@ -144,7 +112,7 @@ def like_url(text):
    return False


def word_shape(text: str) -> str:
    if len(text) >= 100:
        return "LONG"
    shape = []

@@ -171,46 +139,52 @@ def word_shape(text):
    return "".join(shape)


def lower(string: str) -> str:
    return string.lower()


def prefix(string: str) -> str:
    return string[0]


def suffix(string: str) -> str:
    return string[-3:]


def is_alpha(string: str) -> bool:
    return string.isalpha()


def is_digit(string: str) -> bool:
    return string.isdigit()


def is_lower(string: str) -> bool:
    return string.islower()


def is_space(string: str) -> bool:
    return string.isspace()


def is_title(string: str) -> bool:
    return string.istitle()


def is_upper(string: str) -> bool:
    return string.isupper()


def is_stop(string: str, stops: Set[str] = set()) -> bool:
    return string.lower() in stops


def get_lang(text: str, lang: str = "") -> str:
    # This function is partially applied so lang code can be passed in
    # automatically while still allowing pickling
    return lang


LEX_ATTRS = {
    attrs.LOWER: lower,
    attrs.NORM: lower,
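These helpers are deliberately plain module-level functions so that the resulting lex_attr_getters stay picklable; the comment on get_lang points at partial application. A sketch (functools.partial is an assumption about how the language code gets bound elsewhere; it is not shown in this diff):

import functools

get_lang_id = functools.partial(get_lang, lang="id")
assert get_lang_id("any token text") == "id"  # always reports the bound code
assert is_stop("The", {"the", "a"}) is True   # is_stop lowercases before the membership check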


@@ -1,28 +1,35 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""


@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class LigurianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES


class Ligurian(Language):
    lang = "lij"
    Defaults = LigurianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ligurian"]


@@ -1,27 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LithuanianDefaults(Language.Defaults):
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    mod_base_exceptions = {

@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
    }
    del mod_base_exceptions["8)"]
    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)


class Lithuanian(Language):
    lang = "lt"
    Defaults = LithuanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Lithuanian"]


@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""


@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Latvian(Language):
    lang = "lv"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Latvian"]


@@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""


@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Malayalam(Language):
    lang = "ml"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Malayalam"]


@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "mr"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""


@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Marathi(Language):
    lang = "mr"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Marathi"]


@@ -1,33 +1,47 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class NorwegianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    syntax_iterators = SYNTAX_ITERATORS


class Norwegian(Language):
    lang = "nb"
    Defaults = NorwegianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Norwegian"]


@@ -1,23 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "ne"
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""


@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Nepali(Language):
    lang = "ne"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Nepali"]


@@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
    return DutchLemmatizer(data_paths=data_paths)


class DutchDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES


class Dutch(Language):
    lang = "nl"
    Defaults = DutchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Dutch"]
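How the [nlp.lemmatizer] block is meant to resolve, sketched by hand: @lemmatizers picks create_dutch_lemmatizer, and the nested data_paths block is itself a registry call that fetches lookup-table paths from the optional spacy-lookups-data package. The entry name and its lang argument are taken from the config above; everything else here assumes spacy-lookups-data is installed and that the registries expose a catalogue-style .get().

from spacy.util import registry

get_paths = registry.language_data.get("spacy-lookups-data")
make_lemmatizer = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")
lemmatizer = make_lemmatizer(data_paths=get_paths(lang="nl"))
print(lemmatizer.lookup("Katten"))  # lookup() lowercases before searching the table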


@@ -1,3 +1,5 @@
from typing import Optional, List, Dict, Tuple

from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV

@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
        "num": "num",
    }

    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        # Difference 1: self.rules is assumed to be non-None, so no
        # 'is None' check required.
        # String lowercased from the get-go. All lemmatization results in

@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
    # Overrides parent method so that a lowercased version of the string is
    # used to search the lookup table. This is necessary because our lookup
    # table consists entirely of lowercase keys.
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        if orth is not None:

@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):
    # Reimplemented to focus more on application of suffix rules and to return
    # as early as possible.
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> Tuple[List[str], bool]:
        # returns (forms, is_known: bool)
        oov_forms = []
        for old, new in rules:


@@ -1,43 +1,60 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
    return PolishLemmatizer(data_paths=data_paths)


class PolishDefaults(Language.Defaults):
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES


class Polish(Language):
    lang = "pl"
    Defaults = PolishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Polish"]


@@ -1,3 +1,5 @@
from typing import Optional, List, Dict

from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES

@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
    # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
    # It utilizes some prefix based improvements for verb and adjectives
    # lemmatization, as well as case-sensitive lemmatization for nouns.
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        if isinstance(univ_pos, int):
            univ_pos = NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.upper()
        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
        if univ_pos == "NOUN":
            return self.lemmatize_noun(string, morphology, lookup_table)
        if univ_pos != "PROPN":
            string = string.lower()
        if univ_pos == "ADJ":
            return self.lemmatize_adj(string, morphology, lookup_table)
        elif univ_pos == "VERB":
            return self.lemmatize_verb(string, morphology, lookup_table)
        return [lookup_table.get(string, string.lower())]

    def lemmatize_adj(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes different procedures for adjectives
        # with 'nie' and 'naj' prefixes
        if string[:3] == "nie":

@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                    return [lookup_table[naj_search_string]]
            if search_string in lookup_table:
                return [lookup_table[search_string]]
        if string[:3] == "naj":
            naj_search_string = string[3:]
            if naj_search_string in lookup_table:
                return [lookup_table[naj_search_string]]
        return [lookup_table.get(string, string)]

    def lemmatize_verb(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes a different procedure for verbs
        # with 'nie' prefix
        if string[:3] == "nie":
            search_string = string[3:]
            if search_string in lookup_table:
                return [lookup_table[search_string]]
        return [lookup_table.get(string, string)]

    def lemmatize_noun(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method is case-sensitive, in order to work
        # for incorrectly tagged proper names
        if string != string.lower():

@@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
            elif string in lookup_table:
                return [lookup_table[string]]
            return [string.lower()]
        return [lookup_table.get(string, string)]

    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        return string.lower()

    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        raise NotImplementedError
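To make the prefix handling concrete, a sketch with a made-up lookup table: the real tables come from spacy-lookups-data, so both the table contents and the resulting lemma are illustrative only, and constructing the lemmatizer without data paths is an assumption.

lemmatizer = PolishLemmatizer()
lookup_table = {"lepszy": "dobry"}  # hypothetical entry: comparative -> base form
# "najlepszy" starts with "naj", so the prefix is stripped before the lookup
assert lemmatizer.lemmatize_adj("najlepszy", {}, lookup_table) == ["dobry"]
# Unknown strings fall through unchanged
assert lemmatizer.lemmatize_adj("zielony", {}, lookup_table) == ["zielony"]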


@@ -1,20 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class PortugueseDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES

@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
class Portuguese(Language):
    lang = "pt"
    Defaults = PortugueseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Portuguese"]


@@ -1,27 +1,40 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry

# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)

DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class RomanianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
class Romanian(Language):
    lang = "ro"
    Defaults = RomanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Romanian"]


@@ -1,32 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc, registry
from ...language import Language


DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"
"""


@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
    return RussianLemmatizer()


class RussianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)


class Russian(Language):
    lang = "ru"
    Defaults = RussianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Russian"]


@ -1,11 +1,17 @@
from typing import Optional, Tuple, Dict, List
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...lookups import Lookups

PUNCT_RULES = {"«": '"', "»": '"'}

class RussianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(RussianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]
        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))
        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]
        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()
        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string

def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }
    pos = "X"
    morphology = dict()
    unmatched = set()
    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)
    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"
    return pos, morphology
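A minimal usage sketch of the lemmatizer above (assumes pymorphy2 and its Russian dictionaries are installed; the outputs shown are indicative only):

lemmatizer = RussianLemmatizer()
lemmatizer("собаки", "NOUN")  # e.g. ["собака"], taken from the pymorphy2 normal forms
lemmatizer("«", "PUNCT")      # ['"'], via PUNCT_RULES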

spacy/lang/si/__init__.py

@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""

@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class Sinhala(Language):
    lang = "si"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Sinhala"]

spacy/lang/sk/__init__.py

@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "sk"
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""

@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class Slovak(Language):
    lang = "sk"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Slovak"]

spacy/lang/sl/__init__.py

@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""

@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

class Slovenian(Language):
    lang = "sl"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Slovenian"]

spacy/lang/sq/__init__.py

@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""

@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

class Albanian(Language):
    lang = "sq"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Albanian"]

spacy/lang/sr/__init__.py

@@ -1,23 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry

DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""

@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class SerbianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

class Serbian(Language):
    lang = "sr"
    Defaults = SerbianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Serbian"]

spacy/lang/sv/__init__.py

@@ -1,35 +1,54 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""

@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class SwedishDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    syntax_iterators = SYNTAX_ITERATORS

class Swedish(Language):
    lang = "sv"
    Defaults = SwedishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Swedish"]

spacy/lang/ta/__init__.py

@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "ta"
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
"""

@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class Tamil(Language):
    lang = "ta"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Tamil"]

spacy/lang/te/__init__.py

@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "te"
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""

@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class Telugu(Language):
    lang = "te"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Telugu"]

spacy/lang/th/__init__.py

@@ -1,15 +1,44 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry

DEFAULT_CONFIG = """
[nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1"
"""

@registry.language_data("spacy.th.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer():
    def thai_tokenizer_factory(nlp):
        return ThaiTokenizer(nlp)

    return thai_tokenizer_factory

class ThaiTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language) -> None:
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp"
            )
        self.word_tokenize = word_tokenize
        self.vocab = nlp.vocab

    def __call__(self, text: str) -> Doc:
        words = list(self.word_tokenize(text))
        spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

class Thai(Language):
    lang = "th"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Thai"]

spacy/lang/tl/__init__.py

@@ -1,31 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry

DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""

@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class TagalogDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

class Tagalog(Language):
    lang = "tl"
    Defaults = TagalogDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Tagalog"]

spacy/lang/tr/__init__.py

@@ -1,26 +1,40 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry

DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""

@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

class TurkishDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

class Turkish(Language):
    lang = "tr"
    Defaults = TurkishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Turkish"]

spacy/lang/tt/__init__.py

@@ -1,28 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry

DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""

@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class TatarDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = tuple(TOKENIZER_INFIXES)

class Tatar(Language):
    lang = "tt"
    Defaults = TatarDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Tatar"]

spacy/lang/uk/__init__.py

@@ -1,36 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc, registry
from ...language import Language
from .lemmatizer import UkrainianLemmatizer

DEFAULT_CONFIG = """
[nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
"""

@registry.language_data("spacy.uk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
    return UkrainianLemmatizer()

class UkrainianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

class Ukrainian(Language):
    lang = "uk"
    Defaults = UkrainianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Ukrainian"]

spacy/lang/uk/lemmatizer.py

@@ -1,11 +1,17 @@
from typing import Optional, List, Tuple, Dict

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lookups import Lookups
from ...lemmatizer import Lemmatizer

PUNCT_RULES = {"«": '"', "»": '"'}

class UkrainianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(UkrainianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
            )

    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]
        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))
        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]
        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()
        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string

def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }
    pos = "X"
    morphology = dict()
    unmatched = set()
    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)
    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"
    return pos, morphology

spacy/lang/ur/__init__.py

@@ -1,26 +1,53 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""

@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class UrduDefaults(Language.Defaults):
    tokenizer_exceptions = BASE_EXCEPTIONS
    suffixes = TOKENIZER_SUFFIXES

class Urdu(Language):
    lang = "ur"
    Defaults = UrduDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Urdu"]

spacy/lang/vi/__init__.py

@@ -1,38 +1,62 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from ...language import Language
from ...tokens import Doc
from .stop_words import STOP_WORDS
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS

DEFAULT_CONFIG = """
[nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = true
"""

@registry.language_data("spacy.vi.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
    def vietnamese_tokenizer_factory(nlp):
        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

    return vietnamese_tokenizer_factory

class VietnameseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, use_pyvi: bool = False):
        self.vocab = nlp.vocab
        self.use_pyvi = use_pyvi
        if self.use_pyvi:
            try:
                from pyvi import ViTokenizer

                self.ViTokenizer = ViTokenizer
            except ImportError:
                msg = (
                    "Pyvi not installed. Either set use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi"
                )
                raise ImportError(msg)

    def __call__(self, text: str) -> Doc:
        if self.use_pyvi:
            words, spaces = self.ViTokenizer.spacy_tokenize(text)
            return Doc(self.vocab, words=words, spaces=spaces)
        else:
            words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
        return Doc(self.vocab, words=words, spaces=spaces)

class Vietnamese(Language):
    lang = "vi"
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Vietnamese"]

spacy/lang/xx/__init__.py

@@ -1,17 +1,17 @@
from thinc.api import Config

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language

DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""

class MultiLanguageDefaults(Language.Defaults):
    tokenizer_exceptions = BASE_EXCEPTIONS

class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):
    lang = "xx"
    Defaults = MultiLanguageDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["MultiLanguage"]

spacy/lang/yo/__init__.py

@@ -1,21 +1,39 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import registry

DEFAULT_CONFIG = """
[nlp]
lang = "yo"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""

@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

class YorubaDefaults(Language.Defaults):
    tokenizer_exceptions = BASE_EXCEPTIONS

class Yoruba(Language):
    lang = "yo"
    Defaults = YorubaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

__all__ = ["Yoruba"]

spacy/lang/zh/__init__.py

@@ -1,13 +1,15 @@
from typing import Optional, List, Set, Dict, Callable, Any
from enum import Enum
import tempfile
import srsly
import warnings
from pathlib import Path
from thinc.api import Config

from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
@@ -16,88 +18,103 @@ from ... import util

_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"

DEFAULT_CONFIG = """
[nlp]
lang = "zh"
stop_words = {"@language_data": "spacy.zh.stop_words"}
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.ChineseTokenizer.v1"
segmenter = "char"
pkuseg_model = null
pkuseg_user_dict = "default"

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""

class Segmenter(str, Enum):
    char = "char"
    jieba = "jieba"
    pkuseg = "pkuseg"

    @classmethod
    def values(cls):
        return list(cls.__members__.keys())

@registry.language_data("spacy.zh.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

@registry.language_data("spacy.zh.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS

@registry.tokenizers("spacy.ChineseTokenizer.v1")
def create_chinese_tokenizer(
    segmenter: Segmenter = Segmenter.char,
    pkuseg_model: Optional[str] = None,
    pkuseg_user_dict: Optional[str] = "default",
):
    def chinese_tokenizer_factory(nlp):
        return ChineseTokenizer(
            nlp,
            segmenter=segmenter,
            pkuseg_model=pkuseg_model,
            pkuseg_user_dict=pkuseg_user_dict,
        )

    return chinese_tokenizer_factory

class ChineseTokenizer(DummyTokenizer):
    def __init__(
        self,
        nlp: Language,
        segmenter: Segmenter = Segmenter.char,
        pkuseg_model: Optional[str] = None,
        pkuseg_user_dict: Optional[str] = None,
    ):
        self.vocab = nlp.vocab
        if isinstance(segmenter, Segmenter):  # we might have the Enum here
            segmenter = segmenter.value
        self.segmenter = segmenter
        self.pkuseg_model = pkuseg_model
        self.pkuseg_user_dict = pkuseg_user_dict
        self.pkuseg_seg = None
        self.jieba_seg = None
        self.configure_segmenter(segmenter)

    def configure_segmenter(self, segmenter: str):
        if segmenter not in Segmenter.values():
            warn_msg = Warnings.W103.format(
                lang="Chinese",
                segmenter=segmenter,
                supported=", ".join(Segmenter.values()),
                default="'char' (character segmentation)",
            )
            warnings.warn(warn_msg)
            self.segmenter = Segmenter.char
        self.jieba_seg = try_jieba_import(self.segmenter)
        self.pkuseg_seg = try_pkuseg_import(
            self.segmenter,
            pkuseg_model=self.pkuseg_model,
            pkuseg_user_dict=self.pkuseg_user_dict,
        )

    def __call__(self, text: str) -> Doc:
        if self.segmenter == Segmenter.jieba:
            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
            (words, spaces) = util.get_words_and_spaces(words, text)
            return Doc(self.vocab, words=words, spaces=spaces)
        elif self.segmenter == Segmenter.pkuseg:
            if self.pkuseg_seg is None:
                raise ValueError(Errors.E1000)
            words = self.pkuseg_seg.cut(text)
@@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
            return Doc(self.vocab, words=words, spaces=spaces)

        # warn if segmenter setting is not the only remaining option "char"
        if self.segmenter != Segmenter.char:
            warn_msg = Warnings.W103.format(
                lang="Chinese",
                segmenter=self.segmenter,
                supported=", ".join(Segmenter.values()),
                default="'char' (character segmentation)",
            )
            warnings.warn(warn_msg)
@@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
        (words, spaces) = util.get_words_and_spaces(words, text)
        return Doc(self.vocab, words=words, spaces=spaces)

    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
        if self.segmenter == Segmenter.pkuseg:
            if reset:
                try:
                    import pkuseg

                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                except ImportError:
                    msg = (
                        "pkuseg not installed: unable to reset pkuseg "
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
                    )
                    raise ImportError(msg)
            for word in words:
                self.pkuseg_seg.preprocesser.insert(word.strip(), "")
        else:
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
            warnings.warn(warn_msg)

    def to_bytes(self, **kwargs):
        pkuseg_features_b = b""
        pkuseg_weights_b = b""
@@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
            )
        serializers = {
            "pkuseg_features": lambda: pkuseg_features_b,
            "pkuseg_weights": lambda: pkuseg_weights_b,
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
        }
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
@@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
        def deserialize_pkuseg_processors(b):
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

        deserializers = {
            "pkuseg_features": deserialize_pkuseg_features,
            "pkuseg_weights": deserialize_pkuseg_weights,
            "pkuseg_processors": deserialize_pkuseg_processors,
        }
        util.from_bytes(data, deserializers, [])

        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
@@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
            )
            srsly.write_msgpack(path, data)

        serializers = {
            "pkuseg_model": lambda p: save_pkuseg_model(p),
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
        }
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
@@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
            try:
                import pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(
                        "pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
@@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
            try:
                import pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(self._pkuseg_install_msg)
            if self.segmenter == Segmenter.pkuseg:
                data = srsly.read_msgpack(path)
                (user_dict, do_process, common_words, other_words) = data
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)

        serializers = {
            "pkuseg_model": lambda p: load_pkuseg_model(p),
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
        }
        util.from_disk(path, serializers, [])

class ChineseDefaults(Language.Defaults):
    tokenizer_exceptions = BASE_EXCEPTIONS

class Chinese(Language):
    lang = "zh"
    Defaults = ChineseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

def try_jieba_import(segmenter: str):
    try:
        import jieba

        if segmenter == Segmenter.jieba:
            # segment a short text to have jieba initialize its cache in advance
            list(jieba.cut("作为", cut_all=False))

        return jieba
    except ImportError:
        if segmenter == Segmenter.jieba:
            msg = (
                "Jieba not installed. To use jieba, install it with `pip "
                " install jieba` or from https://github.com/fxsjy/jieba"
            )
            raise ImportError(msg)

def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str):
    try:
        import pkuseg

        if pkuseg_model:
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
        elif segmenter == Segmenter.pkuseg:
            msg = (
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
                "was specified. Please provide the name of a pretrained model "
                "or the path to a model with:\n"
                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}}\n'
                "nlp = Chinese.from_config(cfg)"
            )
            raise ValueError(msg)
    except ImportError:
        if segmenter == Segmenter.pkuseg:
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
            raise ImportError(msg)
    except FileNotFoundError:
        if segmenter == Segmenter.pkuseg:
            msg = "Unable to load pkuseg model from: " + pkuseg_model
            raise FileNotFoundError(msg)

def _get_pkuseg_trie_data(node, path=""):
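Following the hint in try_pkuseg_import, a sketch of selecting pkuseg through config overrides (the model path below is a hypothetical placeholder; a real pkuseg model name or directory is required):

cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}}}
# nlp = Chinese.from_config(cfg)  # would build a ChineseTokenizer with the pkuseg segmenter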

(One file diff suppressed because it is too large.)

spacy/lemmatizer.py

@@ -1,5 +1,14 @@
from typing import Optional, Callable, List, Dict

from .lookups import Lookups
from .errors import Errors
from .parts_of_speech import NAMES as UPOS_NAMES
from .util import registry, load_language_data, SimpleFrozenDict

@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
    return Lemmatizer(data_paths=data_paths)

class Lemmatizer:
@@ -14,17 +23,27 @@ class Lemmatizer:
    def load(cls, *args, **kwargs):
        raise NotImplementedError(Errors.E172)

    def __init__(
        self,
        lookups: Optional[Lookups] = None,
        data_paths: dict = SimpleFrozenDict(),
        is_base_form: Optional[Callable] = None,
    ) -> None:
        """Initialize a Lemmatizer.

        lookups (Lookups): The lookups object containing the (optional) tables
            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
        RETURNS (Lemmatizer): The newly constructed object.
        """
        self.lookups = lookups if lookups is not None else Lookups()
        for name, filename in data_paths.items():
            data = load_language_data(filename)
            self.lookups.add_table(name, data)
        self.is_base_form = is_base_form

    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        """Lemmatize a string.

        string (str): The string to lemmatize, e.g. the token text.
@@ -39,7 +58,6 @@ class Lemmatizer:
        if isinstance(univ_pos, int):
            univ_pos = UPOS_NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        # See Issue #435 for example of where this logic is required.
@@ -67,65 +85,31 @@ class Lemmatizer:
            )
        return lemmas

    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "noun", morphology)

    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "verb", morphology)

    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adj", morphology)

    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "det", morphology)

    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "pron", morphology)

    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adp", morphology)

    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "num", morphology)

    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "punct", morphology)

    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        """Look up a lemma in the table, if available. If no lemma is found,
        the original string is returned.
@@ -141,7 +125,13 @@ class Lemmatizer:
            return lookup_table[key]
        return string

    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        orig = string
        string = string.lower()
        forms = []
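A rough sketch of building a lemmatizer from registered lookups data and calling it (assumes spacy-lookups-data is installed and provides an "en" entry point whose tables include the lemma data; the output is indicative only):

data_paths = registry.language_data.get("spacy-lookups-data")("en")
lemmatizer = Lemmatizer(data_paths=data_paths)
lemmatizer("dogs", "noun")  # e.g. ["dog"], via the loaded lemma tables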

spacy/lookups.py

@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict

 from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
 from .strings import get_string_id

 UNSET = object()


+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}
+
+
 class Lookups:
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,
@@ -18,7 +35,7 @@ class Lookups:
     via doc.vocab.lookups.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         """Initialize the Lookups object.

         RETURNS (Lookups): The newly created object.
@@ -27,7 +44,7 @@ class Lookups:
         """
         self._tables = {}

-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name. Delegates to
         Lookups.has_table.
@@ -36,16 +53,16 @@ class Lookups:
         """
         return self.has_table(name)

-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS (int): The number of tables in the lookups."""
         return len(self._tables)

     @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
         return list(self._tables.keys())

-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
         """Add a new table to the lookups. Raises an error if the table exists.

         name (str): Unique name of table.
@@ -60,12 +77,12 @@ class Lookups:
         self._tables[name] = table
         return table

-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
         """Get a table. Raises an error if the table doesn't exist and no
         default value is provided.

         name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.

         DOCS: https://spacy.io/api/lookups#get_table
@@ -76,7 +93,7 @@ class Lookups:
             return default
         return self._tables[name]

-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
         """Remove a table. Raises an error if the table doesn't exist.

         name (str): Name of the table to remove.
@@ -88,7 +105,7 @@ class Lookups:
             raise KeyError(Errors.E159.format(name=name, tables=self.tables))
         return self._tables.pop(name)

-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name.

         name (str): Name of the table.
@@ -98,7 +115,7 @@ class Lookups:
         """
         return name in self._tables

-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
         """Serialize the lookups to a bytestring.

         RETURNS (bytes): The serialized Lookups.
@@ -107,7 +124,7 @@ class Lookups:
         """
         return srsly.msgpack_dumps(self._tables)

-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
         """Load the lookups from a bytestring.

         bytes_data (bytes): The data to load.
@@ -120,7 +137,9 @@ class Lookups:
             self._tables[key] = Table(key, value)
         return self

-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
         """Save the lookups to a directory as lookups.bin. Expects a path to a
         directory, which will be created if it doesn't exist.
@@ -136,7 +155,9 @@ class Lookups:
         with filepath.open("wb") as file_:
             file_.write(self.to_bytes())

-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
         """Load lookups from a directory containing a lookups.bin. Will skip
         loading if the file doesn't exist.
@@ -162,7 +183,7 @@ class Table(OrderedDict):
     """

     @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
         """Initialize a new table from a dict.

         data (dict): The dictionary.
@@ -175,7 +196,7 @@ class Table(OrderedDict):
         self.update(data)
         return self

-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
         """Initialize a new table.

         name (str): Optional table name for reference.
@@ -193,7 +214,7 @@ class Table(OrderedDict):
         if data:
             self.update(data)

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.

         key (str / int): The key to set.
@@ -203,7 +224,7 @@ class Table(OrderedDict):
         OrderedDict.__setitem__(self, key, value)
         self.bloom.add(key)

-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.
         Same as table[key] = value.
@@ -212,7 +233,7 @@ class Table(OrderedDict):
         """
         self[key] = value

-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.
@@ -221,7 +242,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.__getitem__(self, key)

-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.
@@ -231,7 +252,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.get(self, key, default)

-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
         """Check whether a key is in the table. String keys will be hashed.

         key (str / int): The key to check.
@@ -243,7 +264,7 @@ class Table(OrderedDict):
             return False
         return OrderedDict.__contains__(self, key)

-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize table to a bytestring.

         RETURNS (bytes): The serialized table.
@@ -257,7 +278,7 @@ class Table(OrderedDict):
         }
         return srsly.msgpack_dumps(data)

-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
         """Load a table from a bytestring.

         bytes_data (bytes): The data to load.
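A short sketch of how the new registry hook and the typed Lookups API fit together. The table name and data below are placeholders; get_lookups simply returns an empty dict when spacy-lookups-data isn't installed.

from spacy.lookups import Lookups, get_lookups

# {} unless spacy-lookups-data is installed and exposes data for "en".
en_tables = get_lookups("en")
print(sorted(en_tables.keys()))

# Placeholder data for a bytes round trip.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"going": "go"})
assert "lemma_lookup" in lookups

restored = Lookups().from_bytes(lookups.to_bytes())
assert restored.get_table("lemma_lookup")["going"] == "go"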

View File

@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
     return kb
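A hedged sketch of resolving the registered asset by hand (catalogue-style registries expose get(); the paths are placeholders, not taken from this diff):

from spacy.util import registry

# Resolve the function registered as "spacy.KBFromFile.v1" and call it with a
# serialized vocab directory and a knowledge base file.
load_kb = registry.assets.get("spacy.KBFromFile.v1")
kb = load_kb(vocab_path="output/vocab", kb_path="output/kb")
print(kb.get_size_entities())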

View File

@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor

 from ..spacy_vectors import SpacyVectors
 from ... import util
@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -90,13 +71,25 @@ def build_text_classifier(
         nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
     )
     prefix = HashEmbed(
-        nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+        nO=width // 2,
+        nV=embed_size,
+        column=cols.index(PREFIX),
+        dropout=dropout,
+        seed=11,
     )
     suffix = HashEmbed(
-        nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+        nO=width // 2,
+        nV=embed_size,
+        column=cols.index(SUFFIX),
+        dropout=dropout,
+        seed=12,
     )
     shape = HashEmbed(
-        nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+        nO=width // 2,
+        nV=embed_size,
+        column=cols.index(SHAPE),
+        dropout=dropout,
+        seed=13,
     )
     width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
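For orientation, a hedged sketch of building the registered CNN text classifier by resolving architectures from the registry. The tok2vec settings mirror the HashEmbedCNN values in the default configs shown further down in this diff and are illustrative, not required.

from spacy.util import registry

build_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v1")
build_textcat = registry.architectures.get("spacy.TextCatCNN.v1")

tok2vec = build_tok2vec(
    pretrained_vectors=None,
    width=96,
    depth=4,
    embed_size=2000,
    window_size=1,
    maxout_pieces=3,
    subword_features=True,
    dropout=None,
)
model = build_textcat(tok2vec=tok2vec, exclusive_classes=True, nO=None)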

View File

@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


 @registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
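A minimal sketch of the new upstream argument, assuming the "*" default lets the listener attach to whichever Tok2Vec component is upstream in the pipeline, while an explicit name pins it to one component:

from spacy.util import registry

make_listener = registry.architectures.get("spacy.Tok2VecTensors.v1")
listener = make_listener(width=96)                    # upstream="*": any Tok2Vec
pinned = make_listener(width=96, upstream="tok2vec")  # only the "tok2vec" component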

View File

@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
 from wasabi import Printer
 import warnings

 from .tokens import Doc, Token, Span
 from .errors import Errors, Warnings
+from .util import dot_to_dict

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401


-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
     """Analyze a pipeline component with respect to its position in the current
     pipeline and the other components. Will check whether requirements are
     fulfilled (e.g. if previous components assign the attributes).

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
     index (int): The index of the component in the pipeline.
     warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
     """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                 requires[annot] = True
     problems = []
     for annot, fulfilled in requires.items():
@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     return problems


-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
     """Analyze all pipes in the pipeline in order.

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
     """
     problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
     return problems


-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     """Validate component attributes provided to "assigns", "requires" etc.
     Raises error for invalid attributes and formatting. Doesn't check if
     custom extension attributes are registered, since this is something the
     user might want to do themselves later in the component.

-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
     """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
     objs = {"doc": Doc, "token": Token, "span": Span}
     for obj_key, attrs in data.items():
         if obj_key == "span":
@@ -111,37 +101,40 @@ def validate_attrs(values):
     return values


-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
     assert feature in ["assigns", "requires"]
     result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
         if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
     return result


-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that assign an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")


-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that require an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")


-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
     well as any problems if available.
@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg = Printer(pretty=pretty, no_print=no_print)
     overview = []
     problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
     msg.divider("Pipeline Overview")
     header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
     msg.table(overview, header=header, divider=True, multiline=True)
@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
     return {"overview": overview, "problems": problems}


-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
     """Count how many subsequent components require an annotation set by each
     component in the pipeline.
+
+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
     """
     pipe_assigns = []
     pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
     counts = []
     for i, assigns in enumerate(pipe_assigns):
         count = 0
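A hedged usage sketch of the nlp-based analysis API after this change, assuming the module stays importable as spacy.analysis; get_pipe_meta and pipe_names come from the diff above, the blank pipeline is just a placeholder.

import spacy
from spacy.analysis import analyze_all_pipes, get_assigns_for_attr, print_summary

nlp = spacy.blank("en")  # any nlp object works; a blank pipeline simply has no problems
print(analyze_all_pipes(nlp, warn=False))      # {} for an empty pipeline
print(get_assigns_for_attr(nlp, "doc.sents"))  # component names, not (name, pipe) tuples
print_summary(nlp)                             # table of requires/assigns/retokenizes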

View File

@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
 from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
 from .hooks import SentenceSegmenter, SimilarityHook
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens

 __all__ = [
-    "Tagger",
     "DependencyParser",
-    "EntityRecognizer",
     "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
     "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
     "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
     "SimilarityHook",
     "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
     "merge_entities",
     "merge_noun_chunks",
     "merge_subtokens",

View File

@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
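These removed helpers were thin wrappers around util.load_config: read a component's defaults .cfg and, with create_objects=True, resolve its [model] block into a Thinc Model via the registry. After this PR the defaults live with each component's factory instead. A hedged sketch of the removed pattern, with a placeholder path:

from pathlib import Path
from spacy import util

cfg_path = Path("parser_defaults.cfg")  # placeholder location
raw_config = util.load_config(cfg_path, create_objects=False)      # plain nested dict
model = util.load_config(cfg_path, create_objects=True)["model"]   # resolved Model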

View File

@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null

View File

@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null

View File

@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null

View File

@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
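With the per-component defaults files deleted, an equivalent [model] block can still be loaded and resolved the way the removed helpers did it. A hedged sketch using the deleted parser defaults as the config content and a temporary file as a stand-in path:

import tempfile
from pathlib import Path
from spacy import util

# The deleted parser defaults, inlined as a string for illustration.
PARSER_MODEL_CFG = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""

with tempfile.TemporaryDirectory() as tmp_dir:
    cfg_path = Path(tmp_dir) / "parser_model.cfg"
    cfg_path.write_text(PARSER_MODEL_CFG)
    # Same call the removed default_parser() helper used: resolve [model] via the registry.
    model = util.load_config(cfg_path, create_objects=True)["model"]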

Some files were not shown because too many files have changed in this diff.