Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
	Refactor pipeline components, config and language data (#5759)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py (Co-authored-by: Ines Montani <ines@ines.io>)
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably (now also includes keyword-only args)
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg (Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>)
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry (only load the lookups if an entry is available in the registry and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] a list (allows separating component blocks from component order; otherwise, ordering the config would mean a changed component order, which is bad; also allows the initial config to define more components and not use all of them)
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent 311d0bde29
commit 43b960c01b
@@ -17,7 +17,6 @@ import plac
import random
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase

from spacy.gold import Example
from spacy.pipeline import EntityRuler
@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)

        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"kb": kb, "incl_prior": False}
        print("Loading Knowledge Base from '%s'" % kb_path)
        cfg = {
            "kb": {
                "@assets": "spacy.KBFromFile.v1",
                "vocab_path": vocab_path,
                "kb_path": kb_path,
            },
            # use only the predicted EL score and not the prior probability (for demo purposes)
            "incl_prior": False,
        }
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        nlp.add_pipe(entity_linker, last=True)
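The new example config above refers to a registered function ("@assets": "spacy.KBFromFile.v1") instead of passing a KnowledgeBase object directly. The following sketch is a simplified, self-contained illustration of that pattern, using a plain dictionary registry and a hypothetical load_kb_from_file helper; it is not spaCy's actual registry API, only a stand-in to show how a named config block can be resolved into an object when the pipe is created.

# Simplified stand-in for config-driven object creation (not spaCy's API).
# A registry maps string names to factory functions; a block containing an
# "@assets" key is resolved by looking up the named function and calling it
# with the remaining keys as keyword arguments.
ASSETS = {}

def register_asset(name):
    def wrapper(func):
        ASSETS[name] = func
        return func
    return wrapper

@register_asset("spacy.KBFromFile.v1")
def load_kb_from_file(vocab_path, kb_path):
    # Hypothetical loader: the real registered function would build a
    # KnowledgeBase from the serialized vocab and KB files.
    return {"vocab_path": vocab_path, "kb_path": kb_path}

def resolve(value):
    if isinstance(value, dict) and "@assets" in value:
        kwargs = {k: v for k, v in value.items() if k != "@assets"}
        return ASSETS[value["@assets"]](**kwargs)
    return value

cfg = {
    "kb": {"@assets": "spacy.KBFromFile.v1", "vocab_path": "vocab", "kb_path": "kb"},
    "incl_prior": False,
}
resolved = {key: resolve(value) for key, value in cfg.items()}
print(resolved["kb"], resolved["incl_prior"])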
@@ -6,7 +6,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.0a18,<8.0.0a20",
    "thinc>=8.0.0a19,<8.0.0a30",
    "blis>=0.4.0,<0.5.0",
    "pytokenizations"
]
@@ -1,11 +1,11 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a18,<8.0.0a20
thinc>=8.0.0a19,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.7.0,<1.1.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0
@@ -34,15 +34,15 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=8.0.0a18,<8.0.0a20
    thinc>=8.0.0a19,<8.0.0a30
install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.0.0a18,<8.0.0a20
    thinc>=8.0.0a19,<8.0.0a30
    blis>=0.4.0,<0.5.0
    wasabi>=0.7.0,<1.1.0
    wasabi>=0.7.1,<1.1.0
    srsly>=2.1.0,<3.0.0
    catalogue>=0.0.7,<1.1.0
    typer>=0.3.0,<0.4.0
setup.py (8 changed lines)
@@ -32,8 +32,14 @@ MOD_NAMES = [
    "spacy.attrs",
    "spacy.kb",
    "spacy.morphology",
    "spacy.pipeline.pipes",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",
    "spacy.pipeline.pipe",
    "spacy.pipeline.sentencizer",
    "spacy.pipeline.senter",
    "spacy.pipeline.tagger",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
@@ -14,7 +14,6 @@ from .about import __version__
from .errors import Errors, Warnings
from . import util
from .util import registry
from .language import component


if sys.maxunicode == 65535:
@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        err = f"Invalid CLI argument '{opt}'"
        if opt.startswith("--"):  # new argument
            opt = opt.replace("--", "").replace("-", "_")
            if "." not in opt:
@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
            else:
                value = args.pop(0)
            # Just like we do in the config, we're calling json.loads on the
            # values. But since they come from the CLI, it'd b unintuitive to
            # values. But since they come from the CLI, it'd be unintuitive to
            # explicitly mark strings with escaped quotes. So we're working
            # around that here by falling back to a string if parsing fails.
            # TODO: improve logic to handle simple types like list of strings?
@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
            except ValueError:
                result[opt] = str(value)
        else:
            msg.fail(f"{err}: options need to start with --", exits=1)
            msg.fail(f"{err}: override option should start with --", exits=1)
    return result
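parse_config_overrides turns trailing CLI arguments of the form --section.key value into a flat overrides dict, JSON-parsing each value and falling back to a plain string, as the comments above describe. A minimal standalone sketch of that behaviour, independent of spaCy's CLI helpers and with a simplified error path:

import json
from typing import Any, Dict, List

def parse_overrides(args: List[str]) -> Dict[str, Any]:
    # Walk "--training.batch_size 128"-style pairs and build a flat dict.
    result: Dict[str, Any] = {}
    while args:
        opt = args.pop(0)
        if not opt.startswith("--") or "." not in opt:
            raise ValueError(f"Invalid CLI argument '{opt}'")
        key = opt.replace("--", "").replace("-", "_")
        value = args.pop(0)
        try:
            # Values are JSON-parsed, just like values in the config file ...
            result[key] = json.loads(value)
        except ValueError:
            # ... but unquoted strings fall back to plain str.
            result[key] = str(value)
    return result

print(parse_overrides(["--training.batch_size", "128", "--nlp.lang", "en"]))
# {'training.batch_size': 128, 'nlp.lang': 'en'}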
@@ -3,12 +3,12 @@ from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
from wasabi import Printer, MESSAGES, msg, diff_strings
import typer
from thinc.api import Config

from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..schemas import ConfigSchema
from ..gold import Corpus, Example
from ..syntax import nonproj
from ..language import Language
@@ -33,6 +33,9 @@ def debug_config_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
    # fmt: on
):
    """Debug a config.cfg file and show validation errors. The command will
@@ -40,14 +43,37 @@ def debug_config_cli(
    validation errors are blocking and will prevent the rest of the config from
    being resolved. This means that you may not see all validation errors at
    once and some issues are only shown once previous errors have been fixed.
    Similar as with the 'train' command, you can override settings from the config
    as command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]".
    """
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    with show_validation_error():
        util.load_config(
            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
        )
    msg.good("Config is valid")
        config = Config().from_disk(config_path)
        try:
            nlp, _ = util.load_model_from_config(
                config, overrides=overrides, auto_fill=auto_fill
            )
        except ValueError as e:
            msg.fail(str(e), exits=1)
    is_stdout = output_path is not None and str(output_path) == "-"
    if auto_fill:
        orig_config = config.to_str()
        filled_config = nlp.config.to_str()
        if orig_config == filled_config:
            msg.good("Original config is valid, no values were auto-filled")
        else:
            msg.good("Auto-filled config is valid")
            if diff:
                print(diff_strings(config.to_str(), nlp.config.to_str()))
    else:
        msg.good("Original config is valid", show=not is_stdout)
    if is_stdout:
        print(nlp.config.to_str())
    elif output_path is not None:
        nlp.config.to_disk(output_path)
        msg.good(f"Saved updated config to {output_path}")


@debug_cli.command(
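The auto-fill branch above compares the original config with nlp.config and prints a visual diff via wasabi's diff_strings, which is why the wasabi pin is bumped to 0.7.1 elsewhere in this commit. A small usage sketch with two hypothetical config strings standing in for config.to_str() and nlp.config.to_str():

from wasabi import diff_strings

# Two small config snippets standing in for the original and auto-filled
# config.cfg contents.
original = "[training]\ndropout = 0.1\n"
filled = "[training]\ndropout = 0.1\npatience = 1600\n"

if original == filled:
    print("No values were auto-filled")
else:
    # Prints a line-by-line diff of the two strings.
    print(diff_strings(original, filled))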
			@ -117,16 +143,13 @@ def debug_data(
 | 
			
		|||
    if not config_path.exists():
 | 
			
		||||
        msg.fail("Config file not found", config_path, exists=1)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    nlp = util.load_model_from_config(config["nlp"])
 | 
			
		||||
        cfg = Config().from_disk(config_path)
 | 
			
		||||
        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
			
		||||
    # TODO: handle base model
 | 
			
		||||
    lang = config["nlp"]["lang"]
 | 
			
		||||
    base_model = config["nlp"]["base_model"]
 | 
			
		||||
    pipeline = list(config["nlp"]["pipeline"].keys())
 | 
			
		||||
    base_model = config["training"]["base_model"]
 | 
			
		||||
    pipeline = nlp.pipe_names
 | 
			
		||||
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
 | 
			
		||||
    tag_map_path = util.ensure_path(config["training"]["tag_map"])
 | 
			
		||||
    tag_map = {}
 | 
			
		||||
    if tag_map_path is not None:
 | 
			
		||||
| 
						 | 
				
			
			@ -164,19 +187,17 @@ def debug_data(
 | 
			
		|||
    msg.good("Corpus is loadable")
 | 
			
		||||
 | 
			
		||||
    # Create all gold data here to avoid iterating over the train_dataset constantly
 | 
			
		||||
    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
 | 
			
		||||
    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
 | 
			
		||||
    gold_train_unpreprocessed_data = _compile_gold(
 | 
			
		||||
        train_dataset, pipeline, nlp, make_proj=False
 | 
			
		||||
        train_dataset, factory_names, nlp, make_proj=False
 | 
			
		||||
    )
 | 
			
		||||
    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
 | 
			
		||||
    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
 | 
			
		||||
 | 
			
		||||
    train_texts = gold_train_data["texts"]
 | 
			
		||||
    dev_texts = gold_dev_data["texts"]
 | 
			
		||||
 | 
			
		||||
    msg.divider("Training stats")
 | 
			
		||||
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
 | 
			
		||||
    for pipe in [p for p in pipeline if p not in nlp.factories]:
 | 
			
		||||
        msg.fail(f"Pipeline component '{pipe}' not available in factories")
 | 
			
		||||
    if base_model:
 | 
			
		||||
        msg.text(f"Starting with base model '{base_model}'")
 | 
			
		||||
    else:
 | 
			
		||||
| 
						 | 
				
			
			@ -244,7 +265,7 @@ def debug_data(
 | 
			
		|||
    else:
 | 
			
		||||
        msg.info("No word vectors present in the model")
 | 
			
		||||
 | 
			
		||||
    if "ner" in pipeline:
 | 
			
		||||
    if "ner" in factory_names:
 | 
			
		||||
        # Get all unique NER labels present in the data
 | 
			
		||||
        labels = set(
 | 
			
		||||
            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
 | 
			
		||||
| 
						 | 
				
			
			@ -332,7 +353,7 @@ def debug_data(
 | 
			
		|||
                "with punctuation can not be trained with a noise level > 0."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    if "textcat" in pipeline:
 | 
			
		||||
    if "textcat" in factory_names:
 | 
			
		||||
        msg.divider("Text Classification")
 | 
			
		||||
        labels = [label for label in gold_train_data["cats"]]
 | 
			
		||||
        model_labels = _get_labels_from_model(nlp, "textcat")
 | 
			
		||||
| 
						 | 
				
			
			@ -379,7 +400,7 @@ def debug_data(
 | 
			
		|||
                    "contains only instances with mutually-exclusive classes."
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
    if "tagger" in pipeline:
 | 
			
		||||
    if "tagger" in factory_names:
 | 
			
		||||
        msg.divider("Part-of-speech Tagging")
 | 
			
		||||
        labels = [label for label in gold_train_data["tags"]]
 | 
			
		||||
        tag_map = nlp.vocab.morphology.tag_map
 | 
			
		||||
| 
						 | 
				
			
			@ -394,7 +415,7 @@ def debug_data(
 | 
			
		|||
        for label in non_tagmap:
 | 
			
		||||
            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 | 
			
		||||
 | 
			
		||||
    if "parser" in pipeline:
 | 
			
		||||
    if "parser" in factory_names:
 | 
			
		||||
        has_low_data_warning = False
 | 
			
		||||
        msg.divider("Dependency Parsing")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def _compile_gold(
 | 
			
		||||
    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
 | 
			
		||||
    examples: Sequence[Example],
 | 
			
		||||
    factory_names: List[str],
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    make_proj: bool,
 | 
			
		||||
) -> Dict[str, Any]:
 | 
			
		||||
    data = {
 | 
			
		||||
        "ner": Counter(),
 | 
			
		||||
| 
						 | 
				
			
			@ -573,7 +597,7 @@ def _compile_gold(
 | 
			
		|||
            for word in valid_words:
 | 
			
		||||
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
 | 
			
		||||
                    data["words_missing_vectors"].update([word])
 | 
			
		||||
        if "ner" in pipeline:
 | 
			
		||||
        if "ner" in factory_names:
 | 
			
		||||
            for i, label in enumerate(eg.get_aligned_ner()):
 | 
			
		||||
                if label is None:
 | 
			
		||||
                    continue
 | 
			
		||||
| 
						 | 
				
			
			@ -595,14 +619,14 @@ def _compile_gold(
 | 
			
		|||
                    data["ner"][combined_label] += 1
 | 
			
		||||
                elif label == "-":
 | 
			
		||||
                    data["ner"]["-"] += 1
 | 
			
		||||
        if "textcat" in pipeline:
 | 
			
		||||
        if "textcat" in factory_names:
 | 
			
		||||
            data["cats"].update(gold.cats)
 | 
			
		||||
            if list(gold.cats.values()).count(1.0) != 1:
 | 
			
		||||
                data["n_cats_multilabel"] += 1
 | 
			
		||||
        if "tagger" in pipeline:
 | 
			
		||||
        if "tagger" in factory_names:
 | 
			
		||||
            tags = eg.get_aligned("TAG", as_string=True)
 | 
			
		||||
            data["tags"].update([x for x in tags if x is not None])
 | 
			
		||||
        if "parser" in pipeline:
 | 
			
		||||
        if "parser" in factory_names:
 | 
			
		||||
            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
 | 
			
		||||
            data["deps"].update([x for x in aligned_deps if x is not None])
 | 
			
		||||
            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,8 +1,11 @@
 | 
			
		|||
from typing import Dict, Any, Optional
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from wasabi import msg
 | 
			
		||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 | 
			
		||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
 | 
			
		||||
from thinc.api import Model
 | 
			
		||||
import typer
 | 
			
		||||
 | 
			
		||||
from ._util import Arg, Opt, debug_cli
 | 
			
		||||
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 | 
			
		||||
from .. import util
 | 
			
		||||
from ..lang.en import English
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -10,8 +13,10 @@ from ..lang.en import English
 | 
			
		|||
@debug_cli.command("model")
 | 
			
		||||
def debug_model_cli(
 | 
			
		||||
    # fmt: off
 | 
			
		||||
    ctx: typer.Context,  # This is only used to read additional arguments
 | 
			
		||||
    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
			
		||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
 | 
			
		||||
    section: str = Arg(..., help="Section that defines the model to be analysed"),
 | 
			
		||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
 | 
			
		||||
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
			
		||||
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
			
		||||
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
			
		||||
| 
						 | 
				
			
			@ -20,14 +25,18 @@ def debug_model_cli(
 | 
			
		|||
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
			
		||||
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
			
		||||
    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
 | 
			
		||||
    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
			
		||||
    and activations during training.
 | 
			
		||||
    """
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info("Using GPU")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
    print_settings = {
 | 
			
		||||
        "dimensions": dimensions,
 | 
			
		||||
        "parameters": parameters,
 | 
			
		||||
| 
						 | 
				
			
			@ -39,27 +48,47 @@ def debug_model_cli(
 | 
			
		|||
        "print_after_training": P2,
 | 
			
		||||
        "print_prediction": P3,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    config_overrides = parse_config_overrides(ctx.args)
 | 
			
		||||
    cfg = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        try:
 | 
			
		||||
            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
			
		||||
        except ValueError as e:
 | 
			
		||||
            msg.fail(str(e), exits=1)
 | 
			
		||||
    seed = config["pretraining"]["seed"]
 | 
			
		||||
    if seed is not None:
 | 
			
		||||
        msg.info(f"Fixing random seed: {seed}")
 | 
			
		||||
        fix_random_seed(seed)
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
 | 
			
		||||
    component = config
 | 
			
		||||
    parts = section.split(".")
 | 
			
		||||
    for item in parts:
 | 
			
		||||
        try:
 | 
			
		||||
            component = component[item]
 | 
			
		||||
        except KeyError:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"The section '{section}' is not a valid section in the provided config.",
 | 
			
		||||
                exits=1,
 | 
			
		||||
            )
 | 
			
		||||
    if hasattr(component, "model"):
 | 
			
		||||
        model = component.model
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info(f"Using CPU")
 | 
			
		||||
 | 
			
		||||
    debug_model(
 | 
			
		||||
        config_path, print_settings=print_settings,
 | 
			
		||||
    )
 | 
			
		||||
        msg.fail(
 | 
			
		||||
            f"The section '{section}' does not specify an object that holds a Model.",
 | 
			
		||||
            exits=1,
 | 
			
		||||
        )
 | 
			
		||||
    debug_model(model, print_settings=print_settings)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def debug_model(config_path: Path, *, print_settings=None):
 | 
			
		||||
def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
 | 
			
		||||
    if not isinstance(model, Model):
 | 
			
		||||
        msg.fail(
 | 
			
		||||
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
 | 
			
		||||
            exits=1,
 | 
			
		||||
        )
 | 
			
		||||
    if print_settings is None:
 | 
			
		||||
        print_settings = {}
 | 
			
		||||
 | 
			
		||||
    model = util.load_config(config_path, create_objects=True)["model"]
 | 
			
		||||
 | 
			
		||||
    # STEP 0: Printing before training
 | 
			
		||||
    msg.info(f"Analysing model with ID {model.id}")
 | 
			
		||||
    if print_settings.get("print_before_training"):
 | 
			
		||||
| 
						 | 
				
			
			@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
 | 
			
		|||
        _print_model(model, print_settings)
 | 
			
		||||
 | 
			
		||||
    # STEP 1: Initializing the model and printing again
 | 
			
		||||
    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
 | 
			
		||||
    Y = _get_output(model.ops.xp)
 | 
			
		||||
    _set_output_dim(nO=Y.shape[-1], model=model)
 | 
			
		||||
    model.initialize(X=_get_docs(), Y=Y)
 | 
			
		||||
    if print_settings.get("print_after_init"):
 | 
			
		||||
        msg.info(f"After initialization:")
 | 
			
		||||
        _print_model(model, print_settings)
 | 
			
		||||
| 
						 | 
				
			
@@ -110,12 +141,16 @@ def _get_docs():
def _get_output(xp):
    return xp.asarray(
        [
            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
            for i, _ in enumerate(_get_docs())
        ]
    )
    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")


def _set_output_dim(model, nO):
    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
    if model.has_dim("nO") is None:
        model.set_dim("nO", nO)
    if model.has_ref("output_layer"):
        if model.get_ref("output_layer").has_dim("nO") is None:
            model.get_ref("output_layer").set_dim("nO", nO)


def _print_model(model, print_settings):
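_set_output_dim works around cases where Thinc's shape inference leaves nO unset; Model.has_dim returns None for a dimension that is declared but has no value yet. A minimal sketch of the same check against a bare Thinc layer, using Linear as a stand-in for a component model:

from thinc.api import Linear

def set_output_dim(model, nO):
    # Mirror of the helper in the diff: only fill in nO when it is declared
    # but not yet inferred.
    if model.has_dim("nO") is None:
        model.set_dim("nO", nO)
    if model.has_ref("output_layer"):
        if model.get_ref("output_layer").has_dim("nO") is None:
            model.get_ref("output_layer").set_dim("nO", nO)

model = Linear()            # nO and nI are declared but unset
print(model.has_dim("nO"))  # None
set_output_dim(model, nO=3)
print(model.get_dim("nO"))  # 3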
@@ -105,9 +105,10 @@ def evaluate(
        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)

    if displacy_path:
        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
        docs = [ex.predicted for ex in dev_dataset]
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_deps = "parser" in factory_names
        render_ents = "ner" in factory_names
        render_parses(
            docs,
            displacy_path,
@@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if model_path.resolve() != model_path:
        meta["link"] = str(model_path)
        meta["source"] = str(model_path.resolve())
    else:
        meta["source"] = str(model_path)
@@ -125,7 +125,6 @@ def get_meta(
    meta.update(existing_meta)
    nlp = util.load_model_from_path(Path(model_path))
    meta["spacy_version"] = util.get_model_version_range(about.__version__)
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
			@ -5,7 +5,7 @@ import time
 | 
			
		|||
import re
 | 
			
		||||
from collections import Counter
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 | 
			
		||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
			
		||||
from thinc.api import CosineDistance, L2Distance
 | 
			
		||||
from wasabi import msg
 | 
			
		||||
| 
						 | 
				
			
			@ -15,7 +15,6 @@ import typer
 | 
			
		|||
 | 
			
		||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
			
		||||
from ._util import import_code
 | 
			
		||||
from ..schemas import ConfigSchema
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
			
		||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
			
		||||
| 
						 | 
				
			
			@ -37,6 +36,7 @@ def pretrain_cli(
 | 
			
		|||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
			
		||||
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
			
		||||
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
| 
						 | 
				
			
			@ -67,6 +67,7 @@ def pretrain_cli(
 | 
			
		|||
        config_overrides=overrides,
 | 
			
		||||
        resume_path=resume_path,
 | 
			
		||||
        epoch_resume=epoch_resume,
 | 
			
		||||
        use_gpu=use_gpu,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -77,40 +78,29 @@ def pretrain(
 | 
			
		|||
    config_overrides: Dict[str, Any] = {},
 | 
			
		||||
    resume_path: Optional[Path] = None,
 | 
			
		||||
    epoch_resume: Optional[int] = None,
 | 
			
		||||
    use_gpu: int = -1,
 | 
			
		||||
):
 | 
			
		||||
    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            validate=True,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    if not output_dir.exists():
 | 
			
		||||
        output_dir.mkdir()
 | 
			
		||||
        msg.good(f"Created output directory: {output_dir}")
 | 
			
		||||
 | 
			
		||||
    use_gpu = config["training"]["use_gpu"]
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info("Using GPU")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    config = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
			
		||||
    # TODO: validate that [pretraining] block exists
 | 
			
		||||
    if not output_dir.exists():
 | 
			
		||||
        output_dir.mkdir()
 | 
			
		||||
        msg.good(f"Created output directory: {output_dir}")
 | 
			
		||||
    seed = config["pretraining"]["seed"]
 | 
			
		||||
    if seed is not None:
 | 
			
		||||
        fix_random_seed(seed)
 | 
			
		||||
    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
			
		||||
        use_pytorch_for_gpu_memory()
 | 
			
		||||
 | 
			
		||||
    nlp_config = config["nlp"]
 | 
			
		||||
    srsly.write_json(output_dir / "config.json", config)
 | 
			
		||||
    config.to_disk(output_dir / "config.cfg")
 | 
			
		||||
    msg.good("Saved config file in the output directory")
 | 
			
		||||
 | 
			
		||||
    config = util.load_config(config_path, create_objects=True)
 | 
			
		||||
    nlp = util.load_model_from_config(nlp_config)
 | 
			
		||||
    pretrain_config = config["pretraining"]
 | 
			
		||||
 | 
			
		||||
    if texts_loc != "-":  # reading from a file
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -25,7 +25,7 @@ def profile_cli(
 | 
			
		|||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
			
		||||
    Profile which functions take the most time in a spaCy pipeline.
 | 
			
		||||
    Input should be formatted as one JSON object per line with a key "text".
 | 
			
		||||
    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
			
		||||
    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,4 +1,4 @@
 | 
			
		|||
from typing import Optional, Dict, Any
 | 
			
		||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
			
		||||
from timeit import default_timer as timer
 | 
			
		||||
import srsly
 | 
			
		||||
import tqdm
 | 
			
		||||
| 
						 | 
				
			
			@ -7,6 +7,7 @@ from wasabi import msg
 | 
			
		|||
import thinc
 | 
			
		||||
import thinc.schedules
 | 
			
		||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
			
		||||
from thinc.api import Config, Optimizer
 | 
			
		||||
import random
 | 
			
		||||
import typer
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
			
		|||
from ._util import import_code
 | 
			
		||||
from ..gold import Corpus, Example
 | 
			
		||||
from ..lookups import Lookups
 | 
			
		||||
from ..language import Language
 | 
			
		||||
from .. import util
 | 
			
		||||
from ..errors import Errors
 | 
			
		||||
from ..schemas import ConfigSchema
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Don't remove - required to load the built-in architectures
 | 
			
		||||
from ..ml import models  # noqa: F401
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
registry = util.registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.command(
 | 
			
		||||
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
			
		||||
)
 | 
			
		||||
| 
						 | 
				
			
			@ -38,6 +36,8 @@ def train_cli(
 | 
			
		|||
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
			
		||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
			
		||||
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
			
		||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
			
		||||
    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
 | 
			
		||||
    # fmt: on
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
| 
						 | 
				
			
			@ -53,9 +53,7 @@ def train_cli(
 | 
			
		|||
    referenced in the config.
 | 
			
		||||
    """
 | 
			
		||||
    util.set_env_log(verbose)
 | 
			
		||||
    verify_cli_args(
 | 
			
		||||
        train_path=train_path, dev_path=dev_path, config_path=config_path,
 | 
			
		||||
    )
 | 
			
		||||
    verify_cli_args(train_path, dev_path, config_path)
 | 
			
		||||
    overrides = parse_config_overrides(ctx.args)
 | 
			
		||||
    import_code(code_path)
 | 
			
		||||
    train(
 | 
			
		||||
| 
						 | 
				
			
			@ -63,6 +61,8 @@ def train_cli(
 | 
			
		|||
        {"train": train_path, "dev": dev_path},
 | 
			
		||||
        output_path=output_path,
 | 
			
		||||
        config_overrides=overrides,
 | 
			
		||||
        use_gpu=use_gpu,
 | 
			
		||||
        resume_training=resume,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -72,63 +72,53 @@ def train(
 | 
			
		|||
    raw_text: Optional[Path] = None,
 | 
			
		||||
    output_path: Optional[Path] = None,
 | 
			
		||||
    config_overrides: Dict[str, Any] = {},
 | 
			
		||||
    use_gpu: int = -1,
 | 
			
		||||
    resume_training: bool = False,
 | 
			
		||||
) -> None:
 | 
			
		||||
    msg.info(f"Loading config from: {config_path}")
 | 
			
		||||
    # Read the config first without creating objects, to get to the original nlp_config
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        config = util.load_config(
 | 
			
		||||
            config_path,
 | 
			
		||||
            create_objects=False,
 | 
			
		||||
            schema=ConfigSchema,
 | 
			
		||||
            overrides=config_overrides,
 | 
			
		||||
        )
 | 
			
		||||
    use_gpu = config["training"]["use_gpu"]
 | 
			
		||||
    if use_gpu >= 0:
 | 
			
		||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
			
		||||
        require_gpu(use_gpu)
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info("Using CPU")
 | 
			
		||||
    msg.info(f"Loading config and nlp from: {config_path}")
 | 
			
		||||
    config = Config().from_disk(config_path)
 | 
			
		||||
    with show_validation_error():
 | 
			
		||||
        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
			
		||||
    if config["training"]["base_model"]:
 | 
			
		||||
        base_nlp = util.load_model(config["training"]["base_model"])
 | 
			
		||||
        # TODO: do something to check base_nlp against regular nlp described in config?
 | 
			
		||||
        nlp = base_nlp
 | 
			
		||||
    verify_config(nlp)
 | 
			
		||||
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
			
		||||
    if config["training"]["seed"] is not None:
 | 
			
		||||
        fix_random_seed(config["training"]["seed"])
 | 
			
		||||
    if config["training"].get("use_pytorch_for_gpu_memory"):
 | 
			
		||||
    if config["training"]["use_pytorch_for_gpu_memory"]:
 | 
			
		||||
        # It feels kind of weird to not have a default for this.
 | 
			
		||||
        use_pytorch_for_gpu_memory()
 | 
			
		||||
    nlp_config = config["nlp"]
 | 
			
		||||
    config = util.load_config(
 | 
			
		||||
        config_path,
 | 
			
		||||
        create_objects=True,
 | 
			
		||||
        schema=ConfigSchema,
 | 
			
		||||
        overrides=config_overrides,
 | 
			
		||||
    )
 | 
			
		||||
    training = config["training"]
 | 
			
		||||
    msg.info("Creating nlp from config")
 | 
			
		||||
    nlp = util.load_model_from_config(nlp_config)
 | 
			
		||||
    optimizer = training["optimizer"]
 | 
			
		||||
    limit = training["limit"]
 | 
			
		||||
    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
			
		||||
    if "textcat" in nlp_config["pipeline"]:
 | 
			
		||||
        verify_textcat_config(nlp, nlp_config)
 | 
			
		||||
    if training.get("resume", False):
 | 
			
		||||
    if resume_training:
 | 
			
		||||
        msg.info("Resuming training")
 | 
			
		||||
        nlp.resume_training()
 | 
			
		||||
    else:
 | 
			
		||||
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
			
		||||
        train_examples = list(
 | 
			
		||||
            corpus.train_dataset(
 | 
			
		||||
                nlp,
 | 
			
		||||
                shuffle=False,
 | 
			
		||||
                gold_preproc=training["gold_preproc"],
 | 
			
		||||
                max_length=training["max_length"],
 | 
			
		||||
            )
 | 
			
		||||
        train_examples = corpus.train_dataset(
 | 
			
		||||
            nlp,
 | 
			
		||||
            shuffle=False,
 | 
			
		||||
            gold_preproc=training["gold_preproc"],
 | 
			
		||||
            max_length=training["max_length"],
 | 
			
		||||
        )
 | 
			
		||||
        train_examples = list(train_examples)
 | 
			
		||||
        nlp.begin_training(lambda: train_examples)
 | 
			
		||||
 | 
			
		||||
    # Replace tag map with provided mapping
 | 
			
		||||
    nlp.vocab.morphology.load_tag_map(tag_map)
 | 
			
		||||
 | 
			
		||||
    # Load morph rules
 | 
			
		||||
    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
			
		||||
    if tag_map:
 | 
			
		||||
        # Replace tag map with provided mapping
 | 
			
		||||
        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
			
		||||
    if morph_rules:
 | 
			
		||||
        # Load morph rules
 | 
			
		||||
        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
			
		||||
 | 
			
		||||
    # Create empty extra lexeme tables so the data from spacy-lookups-data
 | 
			
		||||
    # isn't loaded if these features are accessed
 | 
			
		||||
| 
						 | 
				
			
			@ -151,9 +141,8 @@ def train(
 | 
			
		|||
        for subpath in tok2vec_path.split("."):
 | 
			
		||||
            tok2vec = tok2vec.get(subpath)
 | 
			
		||||
        if not tok2vec:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
 | 
			
		||||
            )
 | 
			
		||||
            err = f"Could not locate the tok2vec model at {tok2vec_path}"
 | 
			
		||||
            msg.fail(err, exits=1)
 | 
			
		||||
        tok2vec.from_bytes(weights_data)
 | 
			
		||||
 | 
			
		||||
    msg.info("Loading training corpus")
 | 
			
		||||
| 
						 | 
				
			
			@ -169,12 +158,11 @@ def train(
 | 
			
		|||
        evaluate,
 | 
			
		||||
        dropout=training["dropout"],
 | 
			
		||||
        accumulate_gradient=training["accumulate_gradient"],
 | 
			
		||||
        patience=training.get("patience", 0),
 | 
			
		||||
        max_steps=training.get("max_steps", 0),
 | 
			
		||||
        patience=training["patience"],
 | 
			
		||||
        max_steps=training["max_steps"],
 | 
			
		||||
        eval_frequency=training["eval_frequency"],
 | 
			
		||||
        raw_text=raw_text,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
			
		||||
    print_row = setup_printer(training, nlp)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -209,8 +197,10 @@ def train(
 | 
			
		|||
            msg.good(f"Saved model to output directory {final_model_path}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def create_train_batches(nlp, corpus, cfg):
 | 
			
		||||
    max_epochs = cfg.get("max_epochs", 0)
 | 
			
		||||
def create_train_batches(
 | 
			
		||||
    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
 | 
			
		||||
):
 | 
			
		||||
    max_epochs = cfg["max_epochs"]
 | 
			
		||||
    train_examples = list(
 | 
			
		||||
        corpus.train_dataset(
 | 
			
		||||
            nlp,
 | 
			
		||||
| 
						 | 
				
			
			@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
            max_length=cfg["max_length"],
 | 
			
		||||
        )
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    epoch = 0
 | 
			
		||||
    batch_strategy = cfg.get("batch_by", "sequences")
 | 
			
		||||
    batch_strategy = cfg["batch_by"]
 | 
			
		||||
    while True:
 | 
			
		||||
        if len(train_examples) == 0:
 | 
			
		||||
            raise ValueError(Errors.E988)
 | 
			
		||||
| 
						 | 
				
			
			@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
			
		||||
 | 
			
		||||
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
			
		||||
        try:
 | 
			
		||||
            first = next(batches)
 | 
			
		||||
| 
						 | 
				
			
			@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
 | 
			
		|||
        random.shuffle(train_examples)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
			
		||||
    def evaluate():
 | 
			
		||||
        dev_examples = list(
 | 
			
		||||
            corpus.dev_dataset(
 | 
			
		||||
                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
			
		||||
            )
 | 
			
		||||
def create_evaluation_callback(
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    optimizer: Optimizer,
 | 
			
		||||
    corpus: Corpus,
 | 
			
		||||
    cfg: Union[Config, Dict[str, Any]],
 | 
			
		||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
 | 
			
		||||
    def evaluate() -> Tuple[float, Dict[str, float]]:
 | 
			
		||||
        dev_examples = corpus.dev_dataset(
 | 
			
		||||
            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        dev_examples = list(dev_examples)
 | 
			
		||||
        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
			
		||||
        batch_size = cfg.get("evaluation_batch_size", 128)
 | 
			
		||||
        batch_size = cfg["eval_batch_size"]
 | 
			
		||||
        start_time = timer()
 | 
			
		||||
 | 
			
		||||
        if optimizer.averages:
 | 
			
		||||
            with nlp.use_params(optimizer.averages):
 | 
			
		||||
                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
			
		||||
| 
						 | 
				
			
@@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
        try:
            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict="score_weights", key=str(e), keys=list(scores.keys())
                )
            )

            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err)
        scores["speed"] = wps
        return weighted_score, scores
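The checkpoint score computed above is a weighted sum of per-component scores, using the score_weights mapping from the [training] block (scores missing from the mapping contribute 0.0). A quick worked example with the default weights:

# score_weights from the default config; scores is a hypothetical evaluation result.
weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
scores = {"tags_acc": 0.95, "las": 0.88, "ents_f": 0.80, "speed": 12000}

weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
print(round(weighted_score, 3))  # 0.19 + 0.352 + 0.32 = 0.862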
			@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def train_while_improving(
 | 
			
		||||
    nlp,
 | 
			
		||||
    optimizer,
 | 
			
		||||
    nlp: Language,
 | 
			
		||||
    optimizer: Optimizer,
 | 
			
		||||
    train_data,
 | 
			
		||||
    evaluate,
 | 
			
		||||
    *,
 | 
			
		||||
    dropout,
 | 
			
		||||
    eval_frequency,
 | 
			
		||||
    accumulate_gradient=1,
 | 
			
		||||
    patience=0,
 | 
			
		||||
    max_steps=0,
 | 
			
		||||
    raw_text=None,
 | 
			
		||||
    dropout: float,
 | 
			
		||||
    eval_frequency: int,
 | 
			
		||||
    accumulate_gradient: int,
 | 
			
		||||
    patience: int,
 | 
			
		||||
    max_steps: int,
 | 
			
		||||
    raw_text: List[Dict[str, str]],
 | 
			
		||||
):
 | 
			
		||||
    """Train until an evaluation stops improving. Works as a generator,
 | 
			
		||||
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
			
		||||
| 
						 | 
				
			
			@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
 | 
			
		|||
        yield subbatch
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def setup_printer(training, nlp):
 | 
			
		||||
def setup_printer(
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: Language
 | 
			
		||||
) -> Callable[[Dict[str, Any]], None]:
 | 
			
		||||
    score_cols = training["scores"]
 | 
			
		||||
    score_widths = [max(len(col), 6) for col in score_cols]
 | 
			
		||||
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
			
		||||
| 
						 | 
				
			
			@ -423,11 +412,10 @@ def setup_printer(training, nlp):
 | 
			
		|||
    table_header = [col.upper() for col in table_header]
 | 
			
		||||
    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
			
		||||
    table_aligns = ["r" for _ in table_widths]
 | 
			
		||||
 | 
			
		||||
    msg.row(table_header, widths=table_widths)
 | 
			
		||||
    msg.row(["-" * width for width in table_widths])
 | 
			
		||||
 | 
			
		||||
    def print_row(info):
 | 
			
		||||
    def print_row(info: Dict[str, Any]) -> None:
 | 
			
		||||
        try:
 | 
			
		||||
            losses = [
 | 
			
		||||
                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
			
		||||
| 
						 | 
				
			
			@ -463,7 +451,9 @@ def setup_printer(training, nlp):
 | 
			
		|||
    return print_row
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def update_meta(training, nlp, info):
 | 
			
		||||
def update_meta(
 | 
			
		||||
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 | 
			
		||||
) -> None:
 | 
			
		||||
    score_cols = training["scores"]
 | 
			
		||||
    nlp.meta["performance"] = {}
 | 
			
		||||
    for metric in score_cols:
 | 
			
		||||
| 
						 | 
				
			
			@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
 | 
			
		|||
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def load_from_paths(config):
 | 
			
		||||
def load_from_paths(
 | 
			
		||||
    config: Config,
 | 
			
		||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
 | 
			
		||||
    # TODO: separate checks from loading
 | 
			
		||||
    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
			
		||||
    if raw_text is not None:
 | 
			
		||||
| 
						 | 
				
			
			@ -506,7 +498,7 @@ def verify_cli_args(
 | 
			
		|||
    dev_path: Path,
 | 
			
		||||
    config_path: Path,
 | 
			
		||||
    output_path: Optional[Path] = None,
 | 
			
		||||
):
 | 
			
		||||
) -> None:
 | 
			
		||||
    # Make sure all files and paths exists if they are needed
 | 
			
		||||
    if not config_path or not config_path.exists():
 | 
			
		||||
        msg.fail("Config file not found", config_path, exits=1)
 | 
			
		||||
| 
						 | 
				
			
			@ -528,12 +520,23 @@ def verify_cli_args(
 | 
			
		|||
            )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_textcat_config(nlp, nlp_config):
 | 
			
		||||
def verify_config(nlp: Language) -> None:
 | 
			
		||||
    """Perform additional checks based on the config and loaded nlp object."""
 | 
			
		||||
    # TODO: maybe we should validate based on the actual components, the list
 | 
			
		||||
    # in config["nlp"]["pipeline"] instead?
 | 
			
		||||
    for pipe_config in nlp.config["components"].values():
 | 
			
		||||
        # We can't assume that the component name == the factory
 | 
			
		||||
        factory = pipe_config["@factories"]
 | 
			
		||||
        if factory == "textcat":
 | 
			
		||||
            verify_textcat_config(nlp, pipe_config)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
 | 
			
		||||
    # if 'positive_label' is provided: double check whether it's in the data and
 | 
			
		||||
    # the task is binary
 | 
			
		||||
    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
			
		||||
    if pipe_config.get("positive_label"):
 | 
			
		||||
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
			
		||||
        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
			
		||||
        pos_label = pipe_config.get("positive_label")
 | 
			
		||||
        if pos_label not in textcat_labels:
 | 
			
		||||
            msg.fail(
 | 
			
		||||
                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
spacy/default_config.cfg (new file, 102 lines)
			@ -0,0 +1,102 @@
 | 
			
		|||
[nlp]
 | 
			
		||||
lang = null
 | 
			
		||||
stop_words = []
 | 
			
		||||
lex_attr_getters = {}
 | 
			
		||||
pipeline = []
 | 
			
		||||
 | 
			
		||||
[nlp.tokenizer]
 | 
			
		||||
@tokenizers = "spacy.Tokenizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "ltr"
 | 
			
		||||
has_case = true
 | 
			
		||||
has_letters = true
 | 
			
		||||
 | 
			
		||||
[components]
 | 
			
		||||
 | 
			
		||||
# Training hyper-parameters and additional features.
 | 
			
		||||
[training]
 | 
			
		||||
# Whether to train on sequences with 'gold standard' sentence boundaries
 | 
			
		||||
# and tokens. If you set this to true, take care to ensure your run-time
 | 
			
		||||
# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
			
		||||
gold_preproc = false
 | 
			
		||||
# Limitations on training document length or number of examples.
 | 
			
		||||
max_length = 5000
 | 
			
		||||
limit = 0
 | 
			
		||||
# Data augmentation
 | 
			
		||||
orth_variant_level = 0.0
 | 
			
		||||
dropout = 0.1
 | 
			
		||||
# Controls early-stopping. 0 or -1 mean unlimited.
 | 
			
		||||
patience = 1600
 | 
			
		||||
max_epochs = 0
 | 
			
		||||
max_steps = 20000
 | 
			
		||||
eval_frequency = 200
 | 
			
		||||
eval_batch_size = 128
 | 
			
		||||
# Other settings
 | 
			
		||||
seed = 0
 | 
			
		||||
accumulate_gradient = 1
 | 
			
		||||
use_pytorch_for_gpu_memory = false
 | 
			
		||||
# Control how scores are printed and checkpoints are evaluated.
 | 
			
		||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 | 
			
		||||
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
 | 
			
		||||
# These settings are invalid for the transformer models.
 | 
			
		||||
init_tok2vec = null
 | 
			
		||||
discard_oversize = false
 | 
			
		||||
omit_extra_lookups = false
 | 
			
		||||
batch_by = "sequences"
 | 
			
		||||
raw_text = null
 | 
			
		||||
tag_map = null
 | 
			
		||||
morph_rules = null
 | 
			
		||||
base_model = null
 | 
			
		||||
vectors = null
 | 
			
		||||
 | 
			
		||||
[training.batch_size]
 | 
			
		||||
@schedules = "compounding.v1"
 | 
			
		||||
start = 1000
 | 
			
		||||
stop = 1000
 | 
			
		||||
compound = 1.001
 | 
			
		||||
 | 
			
		||||
[training.optimizer]
 | 
			
		||||
@optimizers = "Adam.v1"
 | 
			
		||||
beta1 = 0.9
 | 
			
		||||
beta2 = 0.999
 | 
			
		||||
L2_is_weight_decay = true
 | 
			
		||||
L2 = 0.01
 | 
			
		||||
grad_clip = 1.0
 | 
			
		||||
use_averages = false
 | 
			
		||||
eps = 1e-8
 | 
			
		||||
 | 
			
		||||
[training.optimizer.learn_rate]
 | 
			
		||||
@schedules = "warmup_linear.v1"
 | 
			
		||||
warmup_steps = 250
 | 
			
		||||
total_steps = 20000
 | 
			
		||||
initial_rate = 0.001
 | 
			
		||||
 | 
			
		||||
[pretraining]
 | 
			
		||||
max_epochs = 1000
 | 
			
		||||
min_length = 5
 | 
			
		||||
max_length = 500
 | 
			
		||||
dropout = 0.2
 | 
			
		||||
n_save_every = null
 | 
			
		||||
batch_size = 3000
 | 
			
		||||
seed = ${training:seed}
 | 
			
		||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
 | 
			
		||||
tok2vec_model = "components.tok2vec.model"
 | 
			
		||||
 | 
			
		||||
[pretraining.objective]
 | 
			
		||||
type = "characters"
 | 
			
		||||
n_characters = 4
 | 
			
		||||
 | 
			
		||||
[pretraining.optimizer]
 | 
			
		||||
@optimizers = "Adam.v1"
 | 
			
		||||
beta1 = 0.9
 | 
			
		||||
beta2 = 0.999
 | 
			
		||||
L2_is_weight_decay = true
 | 
			
		||||
L2 = 0.01
 | 
			
		||||
grad_clip = 1.0
 | 
			
		||||
use_averages = true
 | 
			
		||||
eps = 1e-8
 | 
			
		||||
learn_rate = 0.001
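The block above is the complete new spacy/default_config.cfg. As a minimal illustration (not part of this diff), the file can be loaded and tweaked with Thinc's `Config` class, the same class the language modules below use via `Config().from_str(...)`; the local file path in the sketch is an assumption.

```python
# Minimal sketch: load the default config shown above and adjust a training
# setting. The path is assumed for the example; inside spaCy the file ships
# as spacy/default_config.cfg.
from thinc.api import Config

config = Config().from_disk("default_config.cfg")
config["training"]["dropout"] = 0.2       # Config behaves like a nested dict
config["training"]["max_epochs"] = 10
print(config["nlp"]["lang"])              # None here: the default leaves lang = null
print(config.to_str())                    # serialize back to .cfg text
```

Component blocks under [components] are resolved separately through the registered factories, which is what keeps the config itself JSON-serializable.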
spacy/errors.py (108 lines changed)
@@ -124,20 +124,24 @@ class Warnings:
@add_codes
 | 
			
		||||
class Errors:
 | 
			
		||||
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
 | 
			
		||||
    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
 | 
			
		||||
            "calls `nlp.create_pipe` with a component name that's not built "
 | 
			
		||||
            "in - for example, when constructing the pipeline from a model's "
 | 
			
		||||
            "meta.json. If you're using a custom component, you can write to "
 | 
			
		||||
            "`Language.factories['{name}']` or remove it from the model meta "
 | 
			
		||||
            "and add it via `nlp.add_pipe` instead.")
 | 
			
		||||
    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
 | 
			
		||||
            "This usually happens when spaCy calls nlp.{method} with custom "
 | 
			
		||||
            "component name that's not registered on the current language class. "
 | 
			
		||||
            "If you're using a custom component, make sure you've added the "
 | 
			
		||||
            "decorator @Language.component (for function components) or "
 | 
			
		||||
            "@Language.factory (for class components).\n\nAvailable "
 | 
			
		||||
            "factories: {opts}")
 | 
			
		||||
    E003 = ("Not a valid pipeline component. Expected callable, but "
 | 
			
		||||
            "got {component} (name: '{name}').")
 | 
			
		||||
    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
 | 
			
		||||
            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
 | 
			
		||||
            "got {component} (name: '{name}'). If you're using a custom "
 | 
			
		||||
            "component factory, double-check that it correctly returns your "
 | 
			
		||||
            "initialized component.")
 | 
			
		||||
    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
 | 
			
		||||
    E005 = ("Pipeline component '{name}' returned None. If you're using a "
 | 
			
		||||
            "custom component, maybe you forgot to return the processed Doc?")
 | 
			
		||||
    E006 = ("Invalid constraints. You can only set one of the following: "
 | 
			
		||||
            "before, after, first, last.")
 | 
			
		||||
    E006 = ("Invalid constraints for adding pipeline component. You can only "
 | 
			
		||||
            "set one of the following: before (component name or index), "
 | 
			
		||||
            "after (component name or index), first (True) or last (True). "
 | 
			
		||||
            "Invalid configuration: {args}. Existing components: {opts}")
 | 
			
		||||
    E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
 | 
			
		||||
    E008 = ("Some current components would be lost when restoring previous "
 | 
			
		||||
            "pipeline state. If you added components after calling "
 | 
			
		||||
@@ -184,7 +188,7 @@ class Errors:
            "the documentation:\nhttps://spacy.io/usage/models")
 | 
			
		||||
    E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
 | 
			
		||||
            "component to the pipeline with: "
 | 
			
		||||
            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
 | 
			
		||||
            "nlp.add_pipe('sentencizer'). "
 | 
			
		||||
            "Alternatively, add the dependency parser, or set sentence "
 | 
			
		||||
            "boundaries by setting doc[i].is_sent_start.")
 | 
			
		||||
    E031 = ("Invalid token: empty string ('') at position {i}.")
 | 
			
		||||
@@ -365,8 +369,6 @@ class Errors:
    E133 = ("The sum of prior probabilities for alias '{alias}' should not "
 | 
			
		||||
            "exceed 1, but found {sum}.")
 | 
			
		||||
    E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
 | 
			
		||||
    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
 | 
			
		||||
            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
 | 
			
		||||
    E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
 | 
			
		||||
            "to provide a valid JSON object as input with either the `text` "
 | 
			
		||||
            "or `tokens` key. For more info, see the docs:\n"
 | 
			
		||||
@@ -484,6 +486,62 @@ class Errors:
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 | 
			
		||||
 | 
			
		||||
    # TODO: fix numbering after merging develop into master
 | 
			
		||||
    E956 = ("Can't find component '{name}' in [components] block in the config. "
 | 
			
		||||
            "Available components: {opts}")
 | 
			
		||||
    E957 = ("Writing directly to Language.factories isn't needed anymore in "
 | 
			
		||||
            "spaCy v3. Instead, you can use the @Language.factory decorator "
 | 
			
		||||
            "to register your custom component factory or @Language.component "
 | 
			
		||||
            "to register a simple stateless function component that just takes "
 | 
			
		||||
            "a Doc and returns it.")
 | 
			
		||||
    E958 = ("Language code defined in config ({bad_lang_code}) does not match "
 | 
			
		||||
            "language code of current Language subclass {lang} ({lang_code})")
 | 
			
		||||
    E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
 | 
			
		||||
    E960 = ("No config data found for component '{name}'. This is likely a bug "
 | 
			
		||||
            "in spaCy.")
 | 
			
		||||
    E961 = ("Found non-serializable Python object in config. Configs should "
 | 
			
		||||
            "only include values that can be serialized to JSON. If you need "
 | 
			
		||||
            "to pass models or other objects to your component, use a reference "
 | 
			
		||||
            "to a registered function or initialize the object in your "
 | 
			
		||||
            "component.\n\n{config}")
 | 
			
		||||
    E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
 | 
			
		||||
            "got: {cfg_type}.")
 | 
			
		||||
    E963 = ("Can't read component info from @Language.{decorator} decorator. "
 | 
			
		||||
            "Maybe you forgot to call it? Make sure you're using "
 | 
			
		||||
            "@Language.{decorator}() instead of @Language.{decorator}.")
 | 
			
		||||
    E964 = ("The pipeline component factory for '{name}' needs to have the "
 | 
			
		||||
            "following named arguments, which are passed in by spaCy:\n- nlp: "
 | 
			
		||||
            "receives the current nlp object and lets you access the vocab\n- "
 | 
			
		||||
            "name: the name of the component instance, can be used to identify "
 | 
			
		||||
            "the component, output losses etc.")
 | 
			
		||||
    E965 = ("It looks like you're using the @Language.component decorator to "
 | 
			
		||||
            "register '{name}' on a class instead of a function component. If "
 | 
			
		||||
            "you need to register a class or function that *returns* a component "
 | 
			
		||||
            "function, use the @Language.factory decorator instead.")
 | 
			
		||||
    E966 = ("nlp.add_pipe now takes the string name of the registered component "
 | 
			
		||||
            "factory, not a callable component. Expected string, but got "
 | 
			
		||||
            "{component} (name: '{name}').\n\n- If you created your component "
 | 
			
		||||
            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
 | 
			
		||||
            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
 | 
			
		||||
            "like TextCategorizer(): call nlp.add_pipe with the string name "
 | 
			
		||||
            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
 | 
			
		||||
            "component: Add the decorator @Language.component (for function "
 | 
			
		||||
            "components) or @Language.factory (for class components / factories) "
 | 
			
		||||
            "to your custom component and assign it a name, e.g. "
 | 
			
		||||
            "@Language.component('your_name'). You can then run "
 | 
			
		||||
            "nlp.add_pipe('your_name') to add it to the pipeline.")
 | 
			
		||||
    E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
 | 
			
		||||
    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
 | 
			
		||||
            "factory, not a callable component. Expected string, but got "
 | 
			
		||||
            "{component}.\n\n- If you created your component with"
 | 
			
		||||
            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
 | 
			
		||||
            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
 | 
			
		||||
            "component like TextCategorizer(): call nlp.replace_pipe with the "
 | 
			
		||||
            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
 | 
			
		||||
            "- If you're using a custom component: Add the decorator "
 | 
			
		||||
            "@Language.component (for function components) or @Language.factory "
 | 
			
		||||
            "(for class components / factories) to your custom component and "
 | 
			
		||||
            "assign it a name, e.g. @Language.component('your_name'). You can "
 | 
			
		||||
            "then run nlp.replace_pipe('{name}', 'your_name').")
 | 
			
		||||
    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
 | 
			
		||||
    E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
 | 
			
		||||
    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
 | 
			
		||||
@@ -506,10 +564,12 @@ class Errors:
            "into {values}, but found {value}.")
 | 
			
		||||
    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
 | 
			
		||||
            "{keys}")
 | 
			
		||||
    E985 = ("The pipeline component '{component}' is already available in the base "
 | 
			
		||||
            "model. The settings in the component block in the config file are "
 | 
			
		||||
            "being ignored. If you want to replace this component instead, set "
 | 
			
		||||
            "'replace' to True in the training configuration.")
 | 
			
		||||
    E984 = ("Invalid component config for '{name}': no @factories key "
 | 
			
		||||
            "specifying the registered function used to initialize the "
 | 
			
		||||
            "component. For example, @factories = \"ner\" will use the 'ner' "
 | 
			
		||||
            "factory and all other settings in the block will be passed "
 | 
			
		||||
            "to it as arguments.\n\n{config}")
 | 
			
		||||
    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
 | 
			
		||||
    E986 = ("Could not create any training batches: check your input. "
 | 
			
		||||
            "Perhaps discard_oversize should be set to False ?")
 | 
			
		||||
    E987 = ("The text of an example training instance is either a Doc or "
 | 
			
		||||
@@ -530,9 +590,9 @@ class Errors:
    E992 = ("The function `select_pipes` was called with `enable`={enable} "
 | 
			
		||||
            "and `disable`={disable} but that information is conflicting "
 | 
			
		||||
            "for the `nlp` pipeline with components {names}.")
 | 
			
		||||
    E993 = ("The config for 'nlp' should include either a key 'name' to "
 | 
			
		||||
            "refer to an existing model by name or path, or a key 'lang' "
 | 
			
		||||
            "to create a new blank model.")
 | 
			
		||||
    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
 | 
			
		||||
            "the code of the language to initialize it with (for example "
 | 
			
		||||
            "'en' for English).\n\n{config}")
 | 
			
		||||
    E996 = ("Could not parse {file}: {msg}")
 | 
			
		||||
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
			
		||||
            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
			
		||||
@@ -540,9 +600,9 @@ class Errors:
    E999 = ("Unable to merge the `Doc` objects because they do not all share "
 | 
			
		||||
            "the same `Vocab`.")
 | 
			
		||||
    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
 | 
			
		||||
            "initializing the pipeline: "
 | 
			
		||||
            '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
			
		||||
            'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
 | 
			
		||||
             "initializing the pipeline:\n"
 | 
			
		||||
             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
 | 
			
		||||
             'nlp = Chinese(config=cfg)')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@add_codes
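Taken together, the new E957/E965/E966/E968 messages describe the v3 registration workflow. A rough sketch of what they point users to follows; every name in it ("clean_text", "fancy_matcher", FancyMatcher) is made up for the example.

```python
# Sketch of the workflow the new error messages describe; names are hypothetical.
import spacy
from spacy.language import Language

@Language.component("clean_text")             # stateless function component
def clean_text(doc):
    return doc                                # must return the processed Doc (see E005)

class FancyMatcher:
    """Minimal made-up stateful component used by the factory below."""
    def __init__(self, nlp, name):
        self.name = name
    def __call__(self, doc):
        return doc

@Language.factory("fancy_matcher")            # factories receive nlp and name (see E964)
def create_fancy_matcher(nlp, name):
    return FancyMatcher(nlp, name)

nlp = spacy.blank("en")
nlp.add_pipe("clean_text")                    # add by registered string name (see E966)
nlp.add_pipe("fancy_matcher", last=True)
```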
@@ -1,10 +1,9 @@
import re
 | 
			
		||||
 | 
			
		||||
from .conll_ner2docs import n_sents_info
 | 
			
		||||
from ...gold import Example
 | 
			
		||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...tokens import Doc, Token, Span
 | 
			
		||||
from ...vocab import Vocab
 | 
			
		||||
from wasabi import Printer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -73,7 +72,7 @@ def read_conllx(
    ner_map=None,
 | 
			
		||||
):
 | 
			
		||||
    """ Yield docs, one for each sentence """
 | 
			
		||||
    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
 | 
			
		||||
    vocab = Vocab()  # need vocab to make a minimal Doc
 | 
			
		||||
    for sent in input_data.strip().split("\n\n"):
 | 
			
		||||
        lines = sent.strip().split("\n")
 | 
			
		||||
        if lines:
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class AfrikaansDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "af"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "af"
 | 
			
		||||
stop_words = {"@language_data": "spacy.af.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.af.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Afrikaans(Language):
 | 
			
		||||
    lang = "af"
 | 
			
		||||
    Defaults = AfrikaansDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Afrikaans"]
@@ -1,31 +1,48 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "ar"
 | 
			
		||||
stop_words = {"@language_data": "spacy.ar.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "rtl"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = true
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ar.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ar.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArabicDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "ar"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Arabic(Language):
 | 
			
		||||
    lang = "ar"
 | 
			
		||||
    Defaults = ArabicDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Arabic"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BulgarianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "bg"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "bg"
 | 
			
		||||
stop_words = {"@language_data": "spacy.bg.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.bg.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Bulgarian(Language):
 | 
			
		||||
    lang = "bg"
 | 
			
		||||
    Defaults = BulgarianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Bulgarian"]
 | 
			
		||||
@@ -1,18 +1,35 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "bn"
 | 
			
		||||
stop_words = {"@language_data": "spacy.bn.stop_words"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.bn.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BengaliDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "bn"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
@@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
class Bengali(Language):
 | 
			
		||||
    lang = "bn"
 | 
			
		||||
    Defaults = BengaliDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Bengali"]
 | 
			
		||||
@@ -1,31 +1,49 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "ca"
 | 
			
		||||
stop_words = {"@language_data": "spacy.ca.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ca.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.ca.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CatalanDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "ca"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Catalan(Language):
 | 
			
		||||
    lang = "ca"
 | 
			
		||||
    Defaults = CatalanDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Catalan"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CzechDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "cs"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "cs"
 | 
			
		||||
stop_words = {"@language_data": "spacy.cs.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.cs.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Czech(Language):
 | 
			
		||||
    lang = "cs"
 | 
			
		||||
    Defaults = CzechDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Czech"]
 | 
			
		||||
@@ -1,27 +1,50 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "da"
 | 
			
		||||
stop_words = {"@language_data": "spacy.da.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.da.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.da.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class DanishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "da"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Danish(Language):
 | 
			
		||||
    lang = "da"
 | 
			
		||||
    Defaults = DanishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Danish"]
 | 
			
		||||
@@ -1,23 +1,40 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "de"
 | 
			
		||||
stop_words = {"@language_data": "spacy.de.stop_words"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.de.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GermanDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "de"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
    single_orth_variants = [
 | 
			
		||||
        {"tags": ["$("], "variants": ["…", "..."]},
 | 
			
		||||
@@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
class German(Language):
 | 
			
		||||
    lang = "de"
 | 
			
		||||
    Defaults = GermanDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["German"]
 | 
			
		||||
@@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
@@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...lookups import Lookups
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "el"
 | 
			
		||||
stop_words = {"@language_data": "spacy.el.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
 | 
			
		||||
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
 | 
			
		||||
    return GreekLemmatizer(data_paths=data_paths)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.el.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.el.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GreekDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "el"
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    prefixes = TOKENIZER_PREFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def create_lemmatizer(cls, nlp=None, lookups=None):
 | 
			
		||||
        if lookups is None:
 | 
			
		||||
            lookups = Lookups()
 | 
			
		||||
        return GreekLemmatizer(lookups)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Greek(Language):
 | 
			
		||||
    lang = "el"
 | 
			
		||||
    Defaults = GreekDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Greek"]
@@ -1,3 +1,5 @@
from typing import Dict, List
 | 
			
		||||
 | 
			
		||||
from ...lemmatizer import Lemmatizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
    not applicable for Greek language.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def lemmatize(self, string, index, exceptions, rules):
 | 
			
		||||
    def lemmatize(
 | 
			
		||||
        self,
 | 
			
		||||
        string: str,
 | 
			
		||||
        index: Dict[str, List[str]],
 | 
			
		||||
        exceptions: Dict[str, Dict[str, List[str]]],
 | 
			
		||||
        rules: Dict[str, List[List[str]]],
 | 
			
		||||
    ) -> List[str]:
 | 
			
		||||
        string = string.lower()
 | 
			
		||||
        forms = []
 | 
			
		||||
        if string in index:
 | 
			
		||||
@@ -1,25 +1,50 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
from .lemmatizer import is_base_form
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import update_exc
 | 
			
		||||
from ...lemmatizer import Lemmatizer
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _return_en(_):
 | 
			
		||||
    return "en"
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "en"
 | 
			
		||||
stop_words = {"@language_data": "spacy.en.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.en.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.en.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
 | 
			
		||||
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
 | 
			
		||||
    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class EnglishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = _return_en
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    single_orth_variants = [
 | 
			
		||||
@@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def is_base_form(cls, univ_pos, morphology=None):
 | 
			
		||||
        """
 | 
			
		||||
        Check whether we're dealing with an uninflected paradigm, so we can
 | 
			
		||||
        avoid lemmatization entirely.
 | 
			
		||||
 | 
			
		||||
        univ_pos (unicode / int): The token's universal part-of-speech tag.
 | 
			
		||||
        morphology (dict): The token's morphological features following the
 | 
			
		||||
            Universal Dependencies scheme.
 | 
			
		||||
        """
 | 
			
		||||
        if morphology is None:
 | 
			
		||||
            morphology = {}
 | 
			
		||||
        if univ_pos == "noun" and morphology.get("Number") == "sing":
 | 
			
		||||
            return True
 | 
			
		||||
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
 | 
			
		||||
            return True
 | 
			
		||||
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
 | 
			
		||||
        # morphology
 | 
			
		||||
        elif univ_pos == "verb" and (
 | 
			
		||||
            morphology.get("VerbForm") == "fin"
 | 
			
		||||
            and morphology.get("Tense") == "pres"
 | 
			
		||||
            and morphology.get("Number") is None
 | 
			
		||||
        ):
 | 
			
		||||
            return True
 | 
			
		||||
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("VerbForm") == "inf":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("VerbForm") == "none":
 | 
			
		||||
            return True
 | 
			
		||||
        elif morphology.get("Degree") == "pos":
 | 
			
		||||
            return True
 | 
			
		||||
        else:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class English(Language):
 | 
			
		||||
    lang = "en"
 | 
			
		||||
    Defaults = EnglishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["English"]
 | 
			
		||||
spacy/lang/en/lemmatizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from typing import Optional
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Check whether we're dealing with an uninflected paradigm, so we can
 | 
			
		||||
    avoid lemmatization entirely.
 | 
			
		||||
 | 
			
		||||
    univ_pos (unicode / int): The token's universal part-of-speech tag.
 | 
			
		||||
    morphology (dict): The token's morphological features following the
 | 
			
		||||
        Universal Dependencies scheme.
 | 
			
		||||
    """
 | 
			
		||||
    if morphology is None:
 | 
			
		||||
        morphology = {}
 | 
			
		||||
    if univ_pos == "noun" and morphology.get("Number") == "sing":
 | 
			
		||||
        return True
 | 
			
		||||
    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
 | 
			
		||||
        return True
 | 
			
		||||
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
 | 
			
		||||
    # morphology
 | 
			
		||||
    elif univ_pos == "verb" and (
 | 
			
		||||
        morphology.get("VerbForm") == "fin"
 | 
			
		||||
        and morphology.get("Tense") == "pres"
 | 
			
		||||
        and morphology.get("Number") is None
 | 
			
		||||
    ):
 | 
			
		||||
        return True
 | 
			
		||||
    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("VerbForm") == "inf":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("VerbForm") == "none":
 | 
			
		||||
        return True
 | 
			
		||||
    elif morphology.get("Degree") == "pos":
 | 
			
		||||
        return True
 | 
			
		||||
    else:
 | 
			
		||||
        return False
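Moving is_base_form out of EnglishDefaults makes it an importable, easily testable helper. A few illustrative checks, derived purely from the branches above:

```python
# Behaviour implied by the branches above (illustrative only).
from spacy.lang.en.lemmatizer import is_base_form

assert is_base_form("noun", {"Number": "sing"})                      # singular noun
assert is_base_form("verb", {"VerbForm": "inf"})                     # infinitive
assert not is_base_form("verb", {"VerbForm": "fin", "Tense": "past"})
assert not is_base_form("adj")                                       # no morphology given
```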
@@ -1,47 +1,17 @@
from ...attrs import LIKE_NUM
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# fmt: off
 | 
			
		||||
_num_words = [
 | 
			
		||||
    "zero",
 | 
			
		||||
    "one",
 | 
			
		||||
    "two",
 | 
			
		||||
    "three",
 | 
			
		||||
    "four",
 | 
			
		||||
    "five",
 | 
			
		||||
    "six",
 | 
			
		||||
    "seven",
 | 
			
		||||
    "eight",
 | 
			
		||||
    "nine",
 | 
			
		||||
    "ten",
 | 
			
		||||
    "eleven",
 | 
			
		||||
    "twelve",
 | 
			
		||||
    "thirteen",
 | 
			
		||||
    "fourteen",
 | 
			
		||||
    "fifteen",
 | 
			
		||||
    "sixteen",
 | 
			
		||||
    "seventeen",
 | 
			
		||||
    "eighteen",
 | 
			
		||||
    "nineteen",
 | 
			
		||||
    "twenty",
 | 
			
		||||
    "thirty",
 | 
			
		||||
    "forty",
 | 
			
		||||
    "fifty",
 | 
			
		||||
    "sixty",
 | 
			
		||||
    "seventy",
 | 
			
		||||
    "eighty",
 | 
			
		||||
    "ninety",
 | 
			
		||||
    "hundred",
 | 
			
		||||
    "thousand",
 | 
			
		||||
    "million",
 | 
			
		||||
    "billion",
 | 
			
		||||
    "trillion",
 | 
			
		||||
    "quadrillion",
 | 
			
		||||
    "gajillion",
 | 
			
		||||
    "bazillion",
 | 
			
		||||
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
 | 
			
		||||
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
 | 
			
		||||
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
 | 
			
		||||
    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
 | 
			
		||||
    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 | 
			
		||||
]
 | 
			
		||||
# fmt: on
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def like_num(text):
 | 
			
		||||
def like_num(text: str) -> bool:
 | 
			
		||||
    if text.startswith(("+", "-", "±", "~")):
 | 
			
		||||
        text = text[1:]
 | 
			
		||||
    text = text.replace(",", "").replace(".", "")
 | 
			
		||||
@@ -1,33 +1,52 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.config import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "es"
 | 
			
		||||
stop_words = {"@language_data": "spacy.es.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.es.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.es.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SpanishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "es"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Spanish(Language):
 | 
			
		||||
    lang = "es"
 | 
			
		||||
    Defaults = SpanishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Spanish"]
 | 
			
		||||
@@ -1,17 +1,26 @@
from typing import Set
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class EstonianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "et"
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "et"
 | 
			
		||||
stop_words = {"@language_data": "spacy.et.stop_words"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.et.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Estonian(Language):
 | 
			
		||||
    lang = "et"
 | 
			
		||||
    Defaults = EstonianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Estonian"]
 | 
			
		||||
@@ -1,25 +1,41 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from ...util import registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "eu"
 | 
			
		||||
stop_words = {"@language_data": "spacy.eu.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.eu.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.eu.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BasqueDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "eu"
 | 
			
		||||
 | 
			
		||||
    tokenizer_exceptions = BASE_EXCEPTIONS
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Basque(Language):
 | 
			
		||||
    lang = "eu"
 | 
			
		||||
    Defaults = BasqueDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Basque"]
 | 
			
		||||
@@ -1,7 +1,8 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
@@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "fa"
 | 
			
		||||
stop_words = {"@language_data": "spacy.fa.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "rtl"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = true
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer]
 | 
			
		||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
			
		||||
 | 
			
		||||
[nlp.lemmatizer.data_paths]
 | 
			
		||||
@language_data = "spacy-lookups-data"
 | 
			
		||||
lang = ${nlp:lang}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fa.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fa.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PersianDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "fa"
 | 
			
		||||
    tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 | 
			
		||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Persian(Language):
 | 
			
		||||
    lang = "fa"
 | 
			
		||||
    Defaults = PersianDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Persian"]
 | 
			
		||||
@@ -1,31 +1,43 @@
from typing import Set, Dict, Callable, Any
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
			
		||||
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from ..norm_exceptions import BASE_NORMS
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...attrs import LANG, NORM
 | 
			
		||||
from ...util import update_exc, add_lookups
 | 
			
		||||
from ...util import update_exc, registry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "fi"
 | 
			
		||||
stop_words = {"@language_data": "spacy.fi.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fi.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.fi.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FinnishDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "fi"
 | 
			
		||||
    lex_attr_getters[NORM] = add_lookups(
 | 
			
		||||
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
 | 
			
		||||
    )
 | 
			
		||||
    infixes = TOKENIZER_INFIXES
 | 
			
		||||
    suffixes = TOKENIZER_SUFFIXES
 | 
			
		||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Finnish(Language):
 | 
			
		||||
    lang = "fi"
 | 
			
		||||
    Defaults = FinnishDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ["Finnish"]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
@@ -1,44 +1,61 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)


@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class FrenchDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "fr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
    syntax_iterators = SYNTAX_ITERATORS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return FrenchLemmatizer(lookups)


class French(Language):
    lang = "fr"
    Defaults = FrenchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["French"]
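As a hedged illustration (the name here is hypothetical, not from the PR), the same @lemmatizers decorator shown above can register an alternative factory, which a config could then reference via @lemmatizers = "my.FrenchLemmatizer.v2":

    from spacy.util import registry
    from spacy.lang.fr.lemmatizer import FrenchLemmatizer, is_base_form

    @registry.lemmatizers("my.FrenchLemmatizer.v2")
    def create_custom_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
        # mirrors create_french_lemmatizer above; data_paths normally comes
        # from the [nlp.lemmatizer.data_paths] block of the config
        return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)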
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict

from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ

@@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
    the lookup table.
    """

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if "lemma_rules" not in self.lookups:
            return [lookup_table.get(string, string)]

@@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
        )
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
        """
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [
            key
            for key in morphology
            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
        ]
        if univ_pos == "noun" and morphology.get("Number") == "sing":
            return True
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == "verb" and (
            morphology.get("VerbForm") == "fin"
            and morphology.get("Tense") == "pres"
            and morphology.get("Number") is None
            and not others
        ):
            return True
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
            return True
        elif "VerbForm=inf" in morphology:
            return True
        elif "VerbForm=none" in morphology:
            return True
        elif "Number=sing" in morphology:
            return True
        elif "Degree=pos" in morphology:
            return True
        else:
            return False

    def noun(self, string, morphology=None):
        return self(string, "noun", morphology)

    def verb(self, string, morphology=None):
        return self(string, "verb", morphology)

    def adj(self, string, morphology=None):
        return self(string, "adj", morphology)

    def punct(self, string, morphology=None):
        return self(string, "punct", morphology)

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if orth is not None and orth in lookup_table:
            return lookup_table[orth][0]
        return string

    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        forms = []

@@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
        if not forms:
            forms.append(string)
        return list(set(forms))


def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.
    """
    morphology = {} if morphology is None else morphology
    others = [
        key
        for key in morphology
        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
    ]
    if univ_pos == "noun" and morphology.get("Number") == "sing":
        return True
    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    elif univ_pos == "verb" and (
        morphology.get("VerbForm") == "fin"
        and morphology.get("Tense") == "pres"
        and morphology.get("Number") is None
        and not others
    ):
        return True
    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
        return True
    elif "VerbForm=inf" in morphology:
        return True
    elif "VerbForm=none" in morphology:
        return True
    elif "Number=sing" in morphology:
        return True
    elif "Degree=pos" in morphology:
        return True
    else:
        return False
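A small usage check of the module-level is_base_form() helper above; the expected results follow directly from the function body:

    from spacy.lang.fr.lemmatizer import is_base_form

    assert is_base_form("noun", {"Number": "sing"}) is True
    assert is_base_form("verb", {"VerbForm": "inf"}) is True
    assert is_base_form("adj", {"Degree": "cmp"}) is False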
@@ -1,23 +1,33 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""


@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class IrishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ga"

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)


class Irish(Language):
    lang = "ga"
    Defaults = IrishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Irish"]
@@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS

from ...language import Language
from ...util import registry


class GujaratiDefaults(Language.Defaults):
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""


@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Gujarati(Language):
    lang = "gu"
    Defaults = GujaratiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Gujarati"]
@@ -1,22 +1,37 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""


@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class HebrewDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "he"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Hebrew(Language):
    lang = "he"
    Defaults = HebrewDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hebrew"]
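A minimal sketch (not from the diff) of reading the [nlp.writing_system] block back out of the config string above, assuming thinc's Config behaves like a nested dict after from_str():

    from thinc.api import Config

    cfg = Config().from_str(DEFAULT_CONFIG)
    assert dict(cfg["nlp"]["writing_system"]) == {
        "direction": "rtl",
        "has_case": False,
        "has_letters": True,
    }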
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class HindiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "hi"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "hi"
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""


@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Hindi(Language):
    lang = "hi"
    Defaults = HindiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hindi"]
@@ -1,25 +1,39 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class CroatianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS


class Croatian(Language):
    lang = "hr"
    Defaults = CroatianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Croatian"]
@@ -1,22 +1,35 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class HungarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hu"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

@@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
class Hungarian(Language):
    lang = "hu"
    Defaults = HungarianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Hungarian"]
@@ -1,21 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...attrs import LANG
from ...language import Language
from ...util import registry


class ArmenianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hy"
DEFAULT_CONFIG = """
[nlp]
lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""

    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS

@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Armenian(Language):
    lang = "hy"
    Defaults = ArmenianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Armenian"]
@@ -1,21 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config

from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class IndonesianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "id"
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES

@@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
class Indonesian(Language):
    lang = "id"
    Defaults = IndonesianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Indonesian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class IcelandicDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "is"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""


@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Icelandic(Language):
    lang = "is"
    Defaults = IcelandicDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Icelandic"]
@@ -1,20 +1,34 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "it"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES

@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language):
    lang = "it"
    Defaults = ItalianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Italian"]
@@ -1,21 +1,187 @@
from typing import Optional, Union, Dict, Any, Set
from pathlib import Path
import srsly
from collections import namedtuple, OrderedDict
from collections import namedtuple
from thinc.api import Config

from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
from ... import util


DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}

[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
split_mode = null

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""


@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
    def japanese_tokenizer_factory(nlp):
        return JapaneseTokenizer(nlp, split_mode=split_mode)

    return japanese_tokenizer_factory


class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
        self.vocab = nlp.vocab
        self.split_mode = split_mode
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text: str) -> Doc:
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        words, tags, inflections, lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else [[]] * 6
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list
        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            if len(token.surface()) > 0
            # remove empty tokens which can be produced with characters like … that
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self) -> Dict[str, Any]:
        return {"split_mode": self.split_mode}

    def _set_config(self, config: Dict[str, Any] = {}) -> None:
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs) -> bytes:
        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
        return util.to_bytes(serializers, [])

    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
        util.from_bytes(data, deserializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
        return util.to_disk(path, serializers, [])

    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self


class JapaneseDefaults(Language.Defaults):
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]

@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
    return text_dtokens, text_spaces


class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None, config={}):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.split_mode = config.get("split_mode", None)
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text):
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        words, tags, inflections, lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else [[]] * 6
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface

        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list

        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            if len(token.surface()) > 0
            # remove empty tokens which can be produced with characters like … that
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self):
        config = OrderedDict((("split_mode", self.split_mode),))
        return config

    def _set_config(self, config={}):
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs):
        serializers = OrderedDict(
            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
        )
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        deserializers = OrderedDict(
            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
        )
        util.from_bytes(data, deserializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
        )
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None, config={}):
        return JapaneseTokenizer(cls, nlp, config)


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        return self.tokenizer(text)


def pickle_japanese(instance):
    return Japanese, tuple()

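A standalone illustration (the token values here are made up) of the transposition idiom used in JapaneseTokenizer.__call__, where a list of DetailedToken tuples is unpacked into parallel sequences and [[]] * 6 serves as the fallback for empty input:

    from collections import namedtuple

    DetailedToken = namedtuple(
        "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
    )
    dtokens = [
        DetailedToken("東京", "名詞-固有名詞", "", "東京", "トウキョウ", None),
        DetailedToken("です", "助動詞", "", "です", "デス", None),
    ]
    words, tags, inflections, lemmas, readings, sub_tokens_list = (
        zip(*dtokens) if dtokens else [[]] * 6
    )
    print(words)  # ('東京', 'です')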
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class KannadaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "kn"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""


@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Kannada(Language):
    lang = "kn"
    Defaults = KannadaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Kannada"]
@@ -1,51 +1,52 @@
from typing import Set, Optional, Any, Dict
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry


def try_mecab_import():
    try:
        from natto import MeCab
DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )
[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""


# fmt: on
@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp)

    return korean_tokenizer_factory


class KoreanTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
    def __init__(self, nlp: Optional[Language] = None):
        self.vocab = nlp.vocab
        MeCab = try_mecab_import()
        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

    def __del__(self):
        self.mecab_tokenizer.__del__()

    def __call__(self, text):
    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))

@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text):
    def detailed_tokens(self, text: str) -> Dict[str, Any]:
        # POS tag[0], semantic class[1], jongseong presence[2], reading[3],
        # type[4], start POS[5], end POS[6], expression[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):

@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):


class KoreanDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ko"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return KoreanTokenizer(cls, nlp)


class Korean(Language):
    lang = "ko"
    Defaults = KoreanDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)

    def make_doc(self, text):
        return self.tokenizer(text)


def try_mecab_import() -> None:
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )


def check_spaces(text, tokens):
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        yield False


def pickle_korean(instance):
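A quick check (not part of the diff) of the check_spaces() helper defined above: it yields one boolean per token, indicating whether that token is followed by whitespace in the original text, with False for the final token:

    from spacy.lang.ko import check_spaces

    text = "안녕 spaCy"
    tokens = ["안녕", "spaCy"]
    print(list(check_spaces(text, tokens)))  # [True, False]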
@@ -1,26 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LuxembourgishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "lb"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES


class Luxembourgish(Language):
    lang = "lb"
    Defaults = LuxembourgishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Luxembourgish"]
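
The {"@language_data": ...} references in DEFAULT_CONFIG point at the functions registered just above. A hedged sketch of how such a reference is expected to resolve, assuming spaCy's catalogue-style registry exposes get(); the config system performs this lookup automatically when the nlp object is built:

    from spacy.util import registry

    # Look up the function registered under "spacy.lb.stop_words" and call it
    # to obtain the STOP_WORDS set.
    stop_words_func = registry.language_data.get("spacy.lb.stop_words")
    print(len(stop_words_func()))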
@@ -1,3 +1,4 @@
from typing import Set
import unicodedata
import re

@@ -21,21 +22,21 @@ _tlds = set(
)


def is_punct(text):
def is_punct(text: str) -> bool:
    for char in text:
        if not unicodedata.category(char).startswith("P"):
            return False
    return True


def is_ascii(text):
def is_ascii(text: str) -> bool:
    for char in text:
        if ord(char) >= 128:
            return False
    return True


def like_num(text):
def like_num(text: str) -> bool:
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # can be overwritten by lang with list of number words
@@ -49,64 +50,31 @@ def like_num(text):
    return False


def is_bracket(text):
def is_bracket(text: str) -> bool:
    brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
    return text in brackets


def is_quote(text):
    quotes = (
        '"',
        "'",
        "`",
        "«",
        "»",
        "‘",
        "’",
        "‚",
        "‛",
        "“",
        "”",
        "„",
        "‟",
        "‹",
        "›",
        "❮",
        "❯",
        "''",
        "``",
    )
def is_quote(text: str) -> bool:
    # fmt: off
    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
    # fmt: on
    return text in quotes


def is_left_punct(text):
    left_punct = (
        "(",
        "[",
        "{",
        "<",
        '"',
        "'",
        "«",
        "‘",
        "‚",
        "‛",
        "“",
        "„",
        "‟",
        "‹",
        "❮",
        "``",
    )
def is_left_punct(text: str) -> bool:
    # fmt: off
    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
    # fmt: on
    return text in left_punct


def is_right_punct(text):
def is_right_punct(text: str) -> bool:
    right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
    return text in right_punct


def is_currency(text):
def is_currency(text: str) -> bool:
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    for char in text:
        if unicodedata.category(char) != "Sc":
@@ -114,11 +82,11 @@ def is_currency(text):
    return True


def like_email(text):
def like_email(text: str) -> bool:
    return bool(_like_email(text))


def like_url(text):
def like_url(text: str) -> bool:
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything they say http:// is going to be good.
    if text.startswith("http://") or text.startswith("https://"):
@@ -144,7 +112,7 @@ def like_url(text):
    return False


def word_shape(text):
def word_shape(text: str) -> str:
    if len(text) >= 100:
        return "LONG"
    shape = []
@@ -171,46 +139,52 @@ def word_shape(text):
    return "".join(shape)


def lower(string):
def lower(string: str) -> str:
    return string.lower()


def prefix(string):
def prefix(string: str) -> str:
    return string[0]


def suffix(string):
def suffix(string: str) -> str:
    return string[-3:]


def is_alpha(string):
def is_alpha(string: str) -> bool:
    return string.isalpha()


def is_digit(string):
def is_digit(string: str) -> bool:
    return string.isdigit()


def is_lower(string):
def is_lower(string: str) -> bool:
    return string.islower()


def is_space(string):
def is_space(string: str) -> bool:
    return string.isspace()


def is_title(string):
def is_title(string: str) -> bool:
    return string.istitle()


def is_upper(string):
def is_upper(string: str) -> bool:
    return string.isupper()


def is_stop(string, stops=set()):
def is_stop(string: str, stops: Set[str] = set()) -> bool:
    return string.lower() in stops


def get_lang(text: str, lang: str = "") -> str:
    # This function is partially applied so lang code can be passed in
    # automatically while still allowing pickling
    return lang
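
A quick, informal check of a few of the lexical attribute getters defined above (module path assumed to be spacy.lang.lex_attrs):

    from spacy.lang.lex_attrs import is_bracket, is_quote, prefix, suffix, is_stop

    print(is_bracket("("))                   # True
    print(is_quote("«"))                     # True
    print(prefix("spaCy"), suffix("spaCy"))  # s aCy
    print(is_stop("The", {"the"}))           # True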


LEX_ATTRS = {
    attrs.LOWER: lower,
    attrs.NORM: lower,
@@ -1,28 +1,35 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""


@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class LigurianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "lij"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES


class Ligurian(Language):
    lang = "lij"
    Defaults = LigurianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ligurian"]
@@ -1,27 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


def _return_lt(_):
    return "lt"
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class LithuanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = _return_lt
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)

    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    mod_base_exceptions = {
@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
    }
    del mod_base_exceptions["8)"]
    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Lithuanian(Language):
    lang = "lt"
    Defaults = LithuanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Lithuanian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class LatvianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "lv"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""


@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Latvian(Language):
    lang = "lv"
    Defaults = LatvianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Latvian"]
@@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS

from ...language import Language
from ...util import registry


class MalayalamDefaults(Language.Defaults):
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""


@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Malayalam(Language):
    lang = "ml"
    Defaults = MalayalamDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Malayalam"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class MarathiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "mr"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "mr"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""


@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Marathi(Language):
    lang = "mr"
    Defaults = MarathiDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Marathi"]
@@ -1,33 +1,47 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class NorwegianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "nb"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Norwegian(Language):
    lang = "nb"
    Defaults = NorwegianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Norwegian"]
@@ -1,23 +1,33 @@
# coding: utf8
from __future__ import unicode_literals
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class NepaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ne"  # Nepali language ISO code
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ne"
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""


@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Nepali(Language):
    lang = "ne"
    Defaults = NepaliDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Nepali"]
@@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
    return DutchLemmatizer(data_paths=data_paths)


class DutchDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "nl"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return DutchLemmatizer(lookups)


class Dutch(Language):
    lang = "nl"
    Defaults = DutchDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Dutch"]
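
The [nlp.lemmatizer] block in DEFAULT_CONFIG names the factory registered above via @registry.lemmatizers. A hedged sketch of the lookup the config system is expected to perform (assuming catalogue-style registries with get(); the data_paths argument would normally be filled in from the resolved [nlp.lemmatizer.data_paths] section rather than left empty):

    from spacy.util import registry

    factory = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")
    lemmatizer = factory(data_paths={})  # empty dict only for illustration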
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict, Tuple

from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
        "num": "num",
    }

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        # Difference 1: self.rules is assumed to be non-None, so no
        # 'is None' check required.
        # String lowercased from the get-go. All lemmatization results in
@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
    # Overrides parent method so that a lowercased version of the string is
    # used to search the lookup table. This is necessary because our lookup
    # table consists entirely of lowercase keys.
    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        if orth is not None:
@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):

    # Reimplemented to focus more on application of suffix rules and to return
    # as early as possible.
    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> Tuple[List[str], bool]:
        # returns (forms, is_known: bool)
        oov_forms = []
        for old, new in rules:
@@ -1,43 +1,60 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import add_lookups
from ...lookups import Lookups
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
    return PolishLemmatizer(data_paths=data_paths)


class PolishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "pl"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    mod_base_exceptions = {
        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
    }
    tokenizer_exceptions = mod_base_exceptions
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return PolishLemmatizer(lookups)


class Polish(Language):
    lang = "pl"
    Defaults = PolishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Polish"]
@@ -1,3 +1,5 @@
from typing import Optional, List, Dict

from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES
@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
    # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
    # It utilizes some prefix based improvements for verb and adjectives
    # lemmatization, as well as case-sensitive lemmatization for nouns.
    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        if isinstance(univ_pos, int):
            univ_pos = NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.upper()

        lookup_pos = univ_pos.lower()
        if univ_pos == "PROPN":
            lookup_pos = "noun"
        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

        if univ_pos == "NOUN":
            return self.lemmatize_noun(string, morphology, lookup_table)

        if univ_pos != "PROPN":
            string = string.lower()

        if univ_pos == "ADJ":
            return self.lemmatize_adj(string, morphology, lookup_table)
        elif univ_pos == "VERB":
            return self.lemmatize_verb(string, morphology, lookup_table)

        return [lookup_table.get(string, string.lower())]

    def lemmatize_adj(self, string, morphology, lookup_table):
    def lemmatize_adj(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes different procedures for adjectives
        # with 'nie' and 'naj' prefixes
        if string[:3] == "nie":
@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                    return [lookup_table[naj_search_string]]
            if search_string in lookup_table:
                return [lookup_table[search_string]]

        if string[:3] == "naj":
            naj_search_string = string[3:]
            if naj_search_string in lookup_table:
                return [lookup_table[naj_search_string]]

        return [lookup_table.get(string, string)]

    def lemmatize_verb(self, string, morphology, lookup_table):
    def lemmatize_verb(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method utilizes a different procedure for verbs
        # with 'nie' prefix
        if string[:3] == "nie":
            search_string = string[3:]
            if search_string in lookup_table:
                return [lookup_table[search_string]]

        return [lookup_table.get(string, string)]

    def lemmatize_noun(self, string, morphology, lookup_table):
    def lemmatize_noun(
        self, string: str, morphology: dict, lookup_table: Dict[str, str]
    ) -> List[str]:
        # this method is case-sensitive, in order to work
        # for incorrectly tagged proper names
        if string != string.lower():
@@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
            elif string in lookup_table:
                return [lookup_table[string]]
            return [string.lower()]

        return [lookup_table.get(string, string)]

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        return string.lower()

    def lemmatize(self, string, index, exceptions, rules):
    def lemmatize(
        self,
        string: str,
        index: Dict[str, List[str]],
        exceptions: Dict[str, Dict[str, List[str]]],
        rules: Dict[str, List[List[str]]],
    ) -> List[str]:
        raise NotImplementedError
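
A toy illustration of the "nie" prefix handling in lemmatize_verb above; the lookup table here is a stand-in for the real lemma_lookup_verb table loaded from spacy-lookups-data, and passing an empty Lookups() to the constructor is assumed to be acceptable for this sketch:

    from spacy.lang.pl.lemmatizer import PolishLemmatizer
    from spacy.lookups import Lookups

    lemmatizer = PolishLemmatizer(Lookups())
    toy_table = {"lubic": "lubic"}  # placeholder entry, not real lemma data
    # the "nie" prefix is stripped and the remainder is looked up in the table
    print(lemmatizer.lemmatize_verb("nielubic", {}, toy_table))  # -> ["lubic"]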
@@ -1,20 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class PortugueseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "pt"
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
class Portuguese(Language):
    lang = "pt"
    Defaults = PortugueseDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Portuguese"]
@@ -1,27 +1,40 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry

# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)


DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class RomanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ro"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
class Romanian(Language):
    lang = "ro"
    Defaults = RomanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Romanian"]
@@ -1,32 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG


DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"
"""


@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
    return RussianLemmatizer()


class RussianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ru"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return RussianLemmatizer(lookups)


class Russian(Language):
    lang = "ru"
    Defaults = RussianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Russian"]
@@ -1,11 +1,17 @@
from typing import Optional, Tuple, Dict, List

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...lookups import Lookups


PUNCT_RULES = {"«": '"', "»": '"'}


class RussianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups=None):
    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(RussianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]

        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]

        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))

        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]

        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos):
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }

    pos = "X"
    morphology = dict()
    unmatched = set()

    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)

    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"

    return pos, morphology


PUNCT_RULES = {"«": '"', "»": '"'}
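
For reference, oc2ud converts a pymorphy2/OpenCorpora tag string into a Universal Dependencies POS plus a morphological feature dict. A hedged sketch of the expected call pattern; the tag string below is illustrative only, and the exact features returned depend on the parts of gram_map not shown in this hunk:

    from spacy.lang.ru.lemmatizer import oc2ud

    pos, morph = oc2ud("ADJF,Qual masc,sing,nomn")
    print(pos)    # "ADJ", via gram_map["_POS"]["ADJF"]
    print(morph)  # e.g. Case/Gender/Number features, depending on the full gram_map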
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class SinhalaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "si"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""


@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Sinhala(Language):
    lang = "si"
    Defaults = SinhalaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Sinhala"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class SlovakDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sk"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sk"
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""


@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Slovak(Language):
    lang = "sk"
    Defaults = SlovakDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Slovak"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class SlovenianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "sl"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""


@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Slovenian(Language):
    lang = "sl"
    Defaults = SlovenianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Slovenian"]
@@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config

from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry


class AlbanianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "sq"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""


@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class Albanian(Language):
    lang = "sq"
    Defaults = AlbanianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Albanian"]
@@ -1,23 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class SerbianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sr"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Serbian(Language):
    lang = "sr"
    Defaults = SerbianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Serbian"]
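The same pattern repeats across the language modules in this diff: static data is wrapped in a function registered under registry.language_data, and the DEFAULT_CONFIG string refers to it by name through an @language_data block. A minimal, hedged sketch of that mechanism using only the APIs shown above (the "spacy.example.stop_words" name and the two-word stop list are made up for illustration):

from typing import Set

from thinc.api import Config
from spacy.util import registry


@registry.language_data("spacy.example.stop_words")
def example_stop_words() -> Set[str]:
    # Stand-in data; the real modules return their language's STOP_WORDS set
    return {"a", "the"}


CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.example.stop_words"}
"""

config = Config().from_str(CONFIG)
# The {"@language_data": ...} value is a reference that is resolved to the
# registered function's return value when the nlp object is built from the
# config, so the data itself never has to be serialized into the config.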
@@ -1,35 +1,54 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from .syntax_iterators import SYNTAX_ITERATORS

DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class SwedishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sv"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Swedish(Language):
    lang = "sv"
    Defaults = SwedishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Swedish"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class TamilDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "ta"
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ta"
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
"""


@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Tamil(Language):
    lang = "ta"
    Defaults = TamilDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tamil"]
@@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...language import Language
from ...attrs import LANG
from ...util import registry


class TeluguDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "te"
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "te"
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""


@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class Telugu(Language):
    lang = "te"
    Defaults = TeluguDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Telugu"]
@@ -1,15 +1,44 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry


DEFAULT_CONFIG = """
[nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1"
"""


@registry.language_data("spacy.th.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer():
    def thai_tokenizer_factory(nlp):
        return ThaiTokenizer(nlp)

    return thai_tokenizer_factory


class ThaiTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
    def __init__(self, nlp: Language) -> None:
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp"
            )

        self.word_tokenize = word_tokenize
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.vocab = nlp.vocab

    def __call__(self, text):
    def __call__(self, text: str) -> Doc:
        words = list(self.word_tokenize(text))
        spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return ThaiTokenizer(cls, nlp)


class Thai(Language):
    lang = "th"
    Defaults = ThaiDefaults

    def make_doc(self, text):
        return self.tokenizer(text)
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Thai"]
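Custom tokenizers follow the same registry idea: the [nlp.tokenizer] block names an entry in registry.tokenizers, and the registered function returns a factory that receives the nlp object and builds the tokenizer. A hedged sketch with a trivial whitespace tokenizer ("spacy.MyWhitespaceTokenizer.v1" is a made-up name; DummyTokenizer and registry come from spacy.util as in the modules above):

from spacy.language import Language
from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry


@registry.tokenizers("spacy.MyWhitespaceTokenizer.v1")
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):
        # Called with the nlp object so the tokenizer can share its vocab
        return WhitespaceTokenizer(nlp)

    return whitespace_tokenizer_factory


class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language) -> None:
        self.vocab = nlp.vocab

    def __call__(self, text: str) -> Doc:
        words = text.split()
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

A config would then select it with @tokenizers = "spacy.MyWhitespaceTokenizer.v1" under [nlp.tokenizer], just like the Thai entry above.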
@@ -1,31 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


def _return_tl(_):
    return "tl"
DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class TagalogDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = _return_tl
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Tagalog(Language):
    lang = "tl"
    Defaults = TagalogDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tagalog"]
@@ -1,26 +1,40 @@
from typing import Set
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


class TurkishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "tr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS


class Turkish(Language):
    lang = "tr"
    Defaults = TurkishDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Turkish"]
@@ -1,28 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...attrs import LANG
from ...language import Language
from ...util import update_exc
from ...util import update_exc, registry


DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""


@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class TatarDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "tt"

    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = tuple(TOKENIZER_INFIXES)

    stop_words = STOP_WORDS


class Tatar(Language):
    lang = "tt"
    Defaults = TatarDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Tatar"]
@@ -1,36 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer


class UkrainianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "uk"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}

    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
        if lookups is None:
            lookups = Lookups()
        return UkrainianLemmatizer(lookups)
[nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
"""


@registry.language_data("spacy.uk.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
    return UkrainianLemmatizer()


class UkrainianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)


class Ukrainian(Language):
    lang = "uk"
    Defaults = UkrainianDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Ukrainian"]
@@ -1,11 +1,17 @@
from typing import Optional, List, Tuple, Dict

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lookups import Lookups
from ...lemmatizer import Lemmatizer


PUNCT_RULES = {"«": '"', "»": '"'}


class UkrainianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self, lookups=None):
    def __init__(self, lookups: Optional[Lookups] = None) -> None:
        super(UkrainianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
            )

    def __call__(self, string, univ_pos, morphology=None):
    def __call__(
        self, string: str, univ_pos: str, morphology: Optional[dict] = None
    ) -> List[str]:
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]

        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]

        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(set([analysis.normal_form for analysis in filtered_analyses]))

        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
                "VerbForm",
                "Voice",
            ]

        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                    break
            else:
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos):
    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: "ADJ",
            DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
            return symbols_to_str[univ_pos]
        return None

    def lookup(self, string, orth=None):
    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
        "_POS": {
            "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
        "Voice": {"actv": "Act", "pssv": "Pass"},
        "Abbr": {"Abbr": "Yes"},
    }

    pos = "X"
    morphology = dict()
    unmatched = set()

    grams = oc_tag.replace(" ", ",").split(",")
    for gram in grams:
        match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)

    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"

    return pos, morphology


PUNCT_RULES = {"«": '"', "»": '"'}
@@ -1,26 +1,53 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""


@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class UrduDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ur"

    tokenizer_exceptions = BASE_EXCEPTIONS
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Urdu(Language):
    lang = "ur"
    Defaults = UrduDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Urdu"]
@@ -1,38 +1,62 @@
from ...attrs import LANG, NORM
from ..norm_exceptions import BASE_NORMS
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from ...language import Language
from ...tokens import Doc
from .stop_words import STOP_WORDS
from ...util import add_lookups
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS


class VietnameseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "vi"  # for pickling
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS
    use_pyvi = True
DEFAULT_CONFIG = """
[nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}

[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = true
"""


class Vietnamese(Language):
    lang = "vi"
    Defaults = VietnameseDefaults  # override defaults
@registry.language_data("spacy.vi.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS

    def make_doc(self, text):
        if self.Defaults.use_pyvi:

@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True,):
    def vietnamese_tokenizer_factory(nlp):
        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)

    return vietnamese_tokenizer_factory


class VietnameseTokenizer(DummyTokenizer):
    def __init__(self, nlp: Language, use_pyvi: bool = False):
        self.vocab = nlp.vocab
        self.use_pyvi = use_pyvi
        if self.use_pyvi:
            try:
                from pyvi import ViTokenizer

                self.ViTokenizer = ViTokenizer
            except ImportError:
                msg = (
                    "Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                    "Pyvi not installed. Either set use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi"
                )
                raise ImportError(msg)
            words, spaces = ViTokenizer.spacy_tokenize(text)

    def __call__(self, text: str) -> Doc:
        if self.use_pyvi:
            words, spaces = self.ViTokenizer.spacy_tokenize(text)
            return Doc(self.vocab, words=words, spaces=spaces)
        else:
            words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
            return Doc(self.vocab, words=words, spaces=spaces)


class Vietnamese(Language):
    lang = "vi"
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Vietnamese"]
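Settings on a registered tokenizer, such as use_pyvi above, are just extra keys in the [nlp.tokenizer] block and are passed to the factory as arguments. A small, hedged sketch of overriding the default purely in the config; only keys already present in the Vietnamese DEFAULT_CONFIG are used, and how the resolved factory gets attached to an nlp object is still in flux on this branch:

from thinc.api import Config

config_str = """
[nlp]
lang = "vi"

[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = false
"""

config = Config().from_str(config_str)
# The tokenizer block now requests the registered Vietnamese factory with
# use_pyvi=False, so the non-pyvi branch of VietnameseTokenizer.__call__ runs.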
@@ -1,17 +1,17 @@
from thinc.api import Config

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""


class MultiLanguageDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "xx"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    tokenizer_exceptions = BASE_EXCEPTIONS


class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):

    lang = "xx"
    Defaults = MultiLanguageDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["MultiLanguage"]
@@ -1,21 +1,39 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry


DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""


@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
    return STOP_WORDS


@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    return LEX_ATTRS


class YorubaDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "yo"
    stop_words = STOP_WORDS
    tokenizer_exceptions = BASE_EXCEPTIONS


class Yoruba(Language):
    lang = "yo"
    Defaults = YorubaDefaults
    default_config = Config().from_str(DEFAULT_CONFIG)


__all__ = ["Yoruba"]
			@ -1,13 +1,15 @@
 | 
			
		|||
from typing import Optional, List, Set, Dict, Callable, Any
 | 
			
		||||
from enum import Enum
 | 
			
		||||
import tempfile
 | 
			
		||||
import srsly
 | 
			
		||||
import warnings
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
from ...attrs import LANG
 | 
			
		||||
from thinc.api import Config
 | 
			
		||||
 | 
			
		||||
from ...errors import Warnings, Errors
 | 
			
		||||
from ...language import Language
 | 
			
		||||
from ...tokens import Doc
 | 
			
		||||
from ...util import DummyTokenizer
 | 
			
		||||
from ...util import DummyTokenizer, registry
 | 
			
		||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
			
		||||
from .lex_attrs import LEX_ATTRS
 | 
			
		||||
from .stop_words import STOP_WORDS
 | 
			
		||||
| 
						 | 
				
			
			@ -16,88 +18,103 @@ from ... import util
 | 
			
		|||
 | 
			
		||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
			
		||||
 | 
			
		||||
DEFAULT_CONFIG = """
 | 
			
		||||
[nlp]
 | 
			
		||||
lang = "zh"
 | 
			
		||||
stop_words = {"@language_data": "spacy.zh.stop_words"}
 | 
			
		||||
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
 | 
			
		||||
 | 
			
		||||
def try_jieba_import(segmenter):
 | 
			
		||||
    try:
 | 
			
		||||
        import jieba
 | 
			
		||||
[nlp.tokenizer]
 | 
			
		||||
@tokenizers = "spacy.ChineseTokenizer.v1"
 | 
			
		||||
segmenter = "char"
 | 
			
		||||
pkuseg_model = null
 | 
			
		||||
pkuseg_user_dict = "default"
 | 
			
		||||
 | 
			
		||||
        if segmenter == "jieba":
 | 
			
		||||
            # segment a short text to have jieba initialize its cache in advance
 | 
			
		||||
            list(jieba.cut("作为", cut_all=False))
 | 
			
		||||
 | 
			
		||||
        return jieba
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == "jieba":
 | 
			
		||||
            msg = (
 | 
			
		||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
			
		||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
			
		||||
            )
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
[nlp.writing_system]
 | 
			
		||||
direction = "ltr"
 | 
			
		||||
has_case = false
 | 
			
		||||
has_letters = false
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
 | 
			
		||||
    try:
 | 
			
		||||
        import pkuseg
 | 
			
		||||
class Segmenter(str, Enum):
 | 
			
		||||
    char = "char"
 | 
			
		||||
    jieba = "jieba"
 | 
			
		||||
    pkuseg = "pkuseg"
 | 
			
		||||
 | 
			
		||||
        if pkuseg_model:
 | 
			
		||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
			
		||||
        elif segmenter == "pkuseg":
 | 
			
		||||
            msg = (
 | 
			
		||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
			
		||||
                "was specified. Please provide the name of a pretrained model "
 | 
			
		||||
                "or the path to a model with "
 | 
			
		||||
                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
			
		||||
                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
 | 
			
		||||
            )
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == "pkuseg":
 | 
			
		||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
    except FileNotFoundError:
 | 
			
		||||
        if segmenter == "pkuseg":
 | 
			
		||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
			
		||||
            raise FileNotFoundError(msg)
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def values(cls):
 | 
			
		||||
        return list(cls.__members__.keys())
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.zh.stop_words")
 | 
			
		||||
def stop_words() -> Set[str]:
 | 
			
		||||
    return STOP_WORDS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.language_data("spacy.zh.lex_attr_getters")
 | 
			
		||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
			
		||||
    return LEX_ATTRS
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@registry.tokenizers("spacy.ChineseTokenizer.v1")
 | 
			
		||||
def create_chinese_tokenizer(
 | 
			
		||||
    segmenter: Segmenter = Segmenter.char,
 | 
			
		||||
    pkuseg_model: Optional[str] = None,
 | 
			
		||||
    pkuseg_user_dict: Optional[str] = "default",
 | 
			
		||||
):
 | 
			
		||||
    def chinese_tokenizer_factory(nlp):
 | 
			
		||||
        return ChineseTokenizer(
 | 
			
		||||
            nlp,
 | 
			
		||||
            segmenter=segmenter,
 | 
			
		||||
            pkuseg_model=pkuseg_model,
 | 
			
		||||
            pkuseg_user_dict=pkuseg_user_dict,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    return chinese_tokenizer_factory
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ChineseTokenizer(DummyTokenizer):
 | 
			
		||||
    def __init__(self, cls, nlp=None, config={}):
 | 
			
		||||
        self.supported_segmenters = ("char", "jieba", "pkuseg")
 | 
			
		||||
        self.configure_segmenter(config)
 | 
			
		||||
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
 | 
			
		||||
        # remove relevant settings from config so they're not also saved in
 | 
			
		||||
        # Language.meta
 | 
			
		||||
        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
 | 
			
		||||
            if key in config:
 | 
			
		||||
                del config[key]
 | 
			
		||||
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        nlp: Language,
 | 
			
		||||
        segmenter: Segmenter = Segmenter.char,
 | 
			
		||||
        pkuseg_model: Optional[str] = None,
 | 
			
		||||
        pkuseg_user_dict: Optional[str] = None,
 | 
			
		||||
    ):
 | 
			
		||||
        self.vocab = nlp.vocab
 | 
			
		||||
        if isinstance(segmenter, Segmenter):  # we might have the Enum here
 | 
			
		||||
            segmenter = segmenter.value
 | 
			
		||||
        self.segmenter = segmenter
 | 
			
		||||
        self.pkuseg_model = pkuseg_model
 | 
			
		||||
        self.pkuseg_user_dict = pkuseg_user_dict
 | 
			
		||||
        self.pkuseg_seg = None
 | 
			
		||||
        self.jieba_seg = None
 | 
			
		||||
        self.configure_segmenter(segmenter)
 | 
			
		||||
 | 
			
		||||
    def configure_segmenter(self, config):
 | 
			
		||||
        self.segmenter = "char"
 | 
			
		||||
        if "segmenter" in config:
 | 
			
		||||
            if config["segmenter"] in self.supported_segmenters:
 | 
			
		||||
                self.segmenter = config["segmenter"]
 | 
			
		||||
            else:
 | 
			
		||||
                warn_msg = Warnings.W103.format(
 | 
			
		||||
                    lang="Chinese",
 | 
			
		||||
                    segmenter=config["segmenter"],
 | 
			
		||||
                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
			
		||||
                    default="'char' (character segmentation)",
 | 
			
		||||
                )
 | 
			
		||||
                warnings.warn(warn_msg)
 | 
			
		||||
    def configure_segmenter(self, segmenter: str):
 | 
			
		||||
        if segmenter not in Segmenter.values():
 | 
			
		||||
            warn_msg = Warnings.W103.format(
 | 
			
		||||
                lang="Chinese",
 | 
			
		||||
                segmenter=segmenter,
 | 
			
		||||
                supported=", ".join(Segmenter.values()),
 | 
			
		||||
                default="'char' (character segmentation)",
 | 
			
		||||
            )
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
            self.segmenter = Segmenter.char
 | 
			
		||||
        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
			
		||||
        self.pkuseg_seg = try_pkuseg_import(
 | 
			
		||||
            self.segmenter,
 | 
			
		||||
            pkuseg_model=config.get("pkuseg_model", None),
 | 
			
		||||
            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
 | 
			
		||||
            pkuseg_model=self.pkuseg_model,
 | 
			
		||||
            pkuseg_user_dict=self.pkuseg_user_dict,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __call__(self, text):
 | 
			
		||||
        if self.segmenter == "jieba":
 | 
			
		||||
    def __call__(self, text: str) -> Doc:
 | 
			
		||||
        if self.segmenter == Segmenter.jieba:
 | 
			
		||||
            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
			
		||||
            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
			
		||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
        elif self.segmenter == "pkuseg":
 | 
			
		||||
        elif self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
            if self.pkuseg_seg is None:
 | 
			
		||||
                raise ValueError(Errors.E1000)
 | 
			
		||||
            words = self.pkuseg_seg.cut(text)
 | 
			
		||||
| 
						 | 
				
			
			@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
 | 
			
		||||
        # warn if segmenter setting is not the only remaining option "char"
 | 
			
		||||
        if self.segmenter != "char":
 | 
			
		||||
        if self.segmenter != Segmenter.char:
 | 
			
		||||
            warn_msg = Warnings.W103.format(
 | 
			
		||||
                lang="Chinese",
 | 
			
		||||
                segmenter=self.segmenter,
 | 
			
		||||
                supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
			
		||||
                supported=", ".join(Segmenter.values()),
 | 
			
		||||
                default="'char' (character segmentation)",
 | 
			
		||||
            )
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
| 
						 | 
				
			
			@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
			
		||||
        return Doc(self.vocab, words=words, spaces=spaces)
 | 
			
		||||
 | 
			
		||||
    def pkuseg_update_user_dict(self, words, reset=False):
 | 
			
		||||
        if self.segmenter == "pkuseg":
 | 
			
		||||
    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
 | 
			
		||||
        if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
            if reset:
 | 
			
		||||
                try:
 | 
			
		||||
                    import pkuseg
 | 
			
		||||
 | 
			
		||||
                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
			
		||||
                except ImportError:
 | 
			
		||||
                    if self.segmenter == "pkuseg":
 | 
			
		||||
                        msg = (
 | 
			
		||||
                            "pkuseg not installed: unable to reset pkuseg "
 | 
			
		||||
                            "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
                        )
 | 
			
		||||
                        raise ImportError(msg)
 | 
			
		||||
                    msg = (
 | 
			
		||||
                        "pkuseg not installed: unable to reset pkuseg "
 | 
			
		||||
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
                    )
 | 
			
		||||
                    raise ImportError(msg)
 | 
			
		||||
            for word in words:
 | 
			
		||||
                self.pkuseg_seg.preprocesser.insert(word.strip(), "")
 | 
			
		||||
        else:
 | 
			
		||||
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
			
		||||
            warnings.warn(warn_msg)
 | 
			
		||||
 | 
			
		||||
    def _get_config(self):
 | 
			
		||||
        config = OrderedDict((("segmenter", self.segmenter),))
 | 
			
		||||
        return config
 | 
			
		||||
 | 
			
		||||
    def _set_config(self, config={}):
 | 
			
		||||
        self.configure_segmenter(config)
 | 
			
		||||
 | 
			
		||||
    def to_bytes(self, **kwargs):
 | 
			
		||||
        pkuseg_features_b = b""
 | 
			
		||||
        pkuseg_weights_b = b""
 | 
			
		||||
| 
						 | 
				
			
			@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
			
		||||
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
			
		||||
            )
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
 | 
			
		||||
                ("pkuseg_features", lambda: pkuseg_features_b),
 | 
			
		||||
                ("pkuseg_weights", lambda: pkuseg_weights_b),
 | 
			
		||||
                (
 | 
			
		||||
                    "pkuseg_processors",
 | 
			
		||||
                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
			
		||||
                ),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_features": lambda: pkuseg_features_b,
 | 
			
		||||
            "pkuseg_weights": lambda: pkuseg_weights_b,
 | 
			
		||||
            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
			
		||||
        }
 | 
			
		||||
        return util.to_bytes(serializers, [])
 | 
			
		||||
 | 
			
		||||
    def from_bytes(self, data, **kwargs):
 | 
			
		||||
| 
						 | 
				
			
			@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
        def deserialize_pkuseg_processors(b):
 | 
			
		||||
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
			
		||||
 | 
			
		||||
        deserializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
 | 
			
		||||
                ("pkuseg_features", deserialize_pkuseg_features),
 | 
			
		||||
                ("pkuseg_weights", deserialize_pkuseg_weights),
 | 
			
		||||
                ("pkuseg_processors", deserialize_pkuseg_processors),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        deserializers = {
 | 
			
		||||
            "pkuseg_features": deserialize_pkuseg_features,
 | 
			
		||||
            "pkuseg_weights": deserialize_pkuseg_weights,
 | 
			
		||||
            "pkuseg_processors": deserialize_pkuseg_processors,
 | 
			
		||||
        }
 | 
			
		||||
        util.from_bytes(data, deserializers, [])
 | 
			
		||||
 | 
			
		||||
        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
			
		||||
| 
						 | 
				
			
			@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                )
 | 
			
		||||
                srsly.write_msgpack(path, data)
 | 
			
		||||
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
 | 
			
		||||
                ("pkuseg_model", lambda p: save_pkuseg_model(p)),
 | 
			
		||||
                ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_model": lambda p: save_pkuseg_model(p),
 | 
			
		||||
            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
 | 
			
		||||
        }
 | 
			
		||||
        return util.to_disk(path, serializers, [])
 | 
			
		||||
 | 
			
		||||
    def from_disk(self, path, **kwargs):
 | 
			
		||||
| 
						 | 
				
			
			@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            try:
 | 
			
		||||
                import pkuseg
 | 
			
		||||
            except ImportError:
 | 
			
		||||
                if self.segmenter == "pkuseg":
 | 
			
		||||
                if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                    raise ImportError(
 | 
			
		||||
                        "pkuseg not installed. To use this model, "
 | 
			
		||||
                        + _PKUSEG_INSTALL_MSG
 | 
			
		||||
| 
						 | 
				
			
			@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
            try:
 | 
			
		||||
                import pkuseg
 | 
			
		||||
            except ImportError:
 | 
			
		||||
                if self.segmenter == "pkuseg":
 | 
			
		||||
                if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                    raise ImportError(self._pkuseg_install_msg)
 | 
			
		||||
            if self.segmenter == "pkuseg":
 | 
			
		||||
            if self.segmenter == Segmenter.pkuseg:
 | 
			
		||||
                data = srsly.read_msgpack(path)
 | 
			
		||||
                (user_dict, do_process, common_words, other_words) = data
 | 
			
		||||
                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
 | 
			
		||||
| 
						 | 
				
			
			@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
 | 
			
		|||
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
 | 
			
		||||
                self.pkuseg_seg.postprocesser.other_words = set(other_words)
 | 
			
		||||
 | 
			
		||||
        serializers = OrderedDict(
 | 
			
		||||
            (
 | 
			
		||||
                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
 | 
			
		||||
                ("pkuseg_model", lambda p: load_pkuseg_model(p)),
 | 
			
		||||
                ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
 | 
			
		||||
            )
 | 
			
		||||
        )
 | 
			
		||||
        serializers = {
 | 
			
		||||
            "pkuseg_model": lambda p: load_pkuseg_model(p),
 | 
			
		||||
            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
 | 
			
		||||
        }
 | 
			
		||||
        util.from_disk(path, serializers, [])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ChineseDefaults(Language.Defaults):
 | 
			
		||||
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 | 
			
		||||
    lex_attr_getters.update(LEX_ATTRS)
 | 
			
		||||
    lex_attr_getters[LANG] = lambda text: "zh"
 | 
			
		||||
    tokenizer_exceptions = BASE_EXCEPTIONS
 | 
			
		||||
    stop_words = STOP_WORDS
 | 
			
		||||
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
 | 
			
		||||
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def create_tokenizer(cls, nlp=None, config={}):
 | 
			
		||||
        return ChineseTokenizer(cls, nlp, config=config)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Chinese(Language):
 | 
			
		||||
    lang = "zh"
 | 
			
		||||
    Defaults = ChineseDefaults  # override defaults
 | 
			
		||||
    Defaults = ChineseDefaults
 | 
			
		||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
			
		||||
 | 
			
		||||
    def make_doc(self, text):
 | 
			
		||||
        return self.tokenizer(text)
 | 
			
		||||
 | 
			
		||||
def try_jieba_import(segmenter: str) -> None:
 | 
			
		||||
    try:
 | 
			
		||||
        import jieba
 | 
			
		||||
 | 
			
		||||
        if segmenter == Segmenter.jieba:
 | 
			
		||||
            # segment a short text to have jieba initialize its cache in advance
 | 
			
		||||
            list(jieba.cut("作为", cut_all=False))
 | 
			
		||||
 | 
			
		||||
        return jieba
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == Segmenter.jieba:
 | 
			
		||||
            msg = (
 | 
			
		||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
			
		||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
			
		||||
            )
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
 | 
			
		||||
    try:
 | 
			
		||||
        import pkuseg
 | 
			
		||||
 | 
			
		||||
        if pkuseg_model:
 | 
			
		||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
			
		||||
        elif segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = (
 | 
			
		||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
			
		||||
                "was specified. Please provide the name of a pretrained model "
 | 
			
		||||
                "or the path to a model with:\n"
 | 
			
		||||
                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
 | 
			
		||||
                "nlp = Chinese.from_config(cfg)"
 | 
			
		||||
            )
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
    except ImportError:
 | 
			
		||||
        if segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
			
		||||
            raise ImportError(msg)
 | 
			
		||||
    except FileNotFoundError:
 | 
			
		||||
        if segmenter == Segmenter.pkuseg:
 | 
			
		||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
			
		||||
            raise FileNotFoundError(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_pkuseg_trie_data(node, path=""):
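The error message in try_pkuseg_import above doubles as a usage hint. A minimal sketch of that configuration follows; the model path is hypothetical, pkuseg must be installed, and the exact from_config behavior was still work in progress in this refactor:

# Hedged sketch based on the error message above; "/path/to/pkuseg_model" is a placeholder.
from spacy.lang.zh import Chinese

cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}}}
nlp = Chinese.from_config(cfg)
doc = nlp("这是一个句子")  # tokenized with the configured pkuseg segmenter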
spacy/language.py: file diff suppressed because it is too large (1052 changed lines not shown).
@@ -1,5 +1,14 @@
+from typing import Optional, Callable, List, Dict

from .lookups import Lookups
from .errors import Errors
from .parts_of_speech import NAMES as UPOS_NAMES
+from .util import registry, load_language_data, SimpleFrozenDict


+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths)


class Lemmatizer:

@@ -14,17 +23,27 @@ class Lemmatizer:
    def load(cls, *args, **kwargs):
        raise NotImplementedError(Errors.E172)

-    def __init__(self, lookups, is_base_form=None):
+    def __init__(
+        self,
+        lookups: Optional[Lookups] = None,
+        data_paths: dict = SimpleFrozenDict(),
+        is_base_form: Optional[Callable] = None,
+    ) -> None:
        """Initialize a Lemmatizer.

        lookups (Lookups): The lookups object containing the (optional) tables
            "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
        RETURNS (Lemmatizer): The newly constructed object.
        """
-        self.lookups = lookups
+        self.lookups = lookups if lookups is not None else Lookups()
+        for name, filename in data_paths.items():
+            data = load_language_data(filename)
+            self.lookups.add_table(name, data)
        self.is_base_form = is_base_form

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
        """Lemmatize a string.

        string (str): The string to lemmatize, e.g. the token text.

@@ -39,7 +58,6 @@ class Lemmatizer:
        if isinstance(univ_pos, int):
            univ_pos = UPOS_NAMES.get(univ_pos, "X")
        univ_pos = univ_pos.lower()

        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        # See Issue #435 for example of where this logic is requied.

@@ -67,65 +85,31 @@ class Lemmatizer:
        )
        return lemmas

-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
+    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "noun", morphology)

-    def verb(self, string, morphology=None):
+    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "verb", morphology)

-    def adj(self, string, morphology=None):
+    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adj", morphology)

-    def det(self, string, morphology=None):
+    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "det", morphology)

-    def pron(self, string, morphology=None):
+    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "pron", morphology)

-    def adp(self, string, morphology=None):
+    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "adp", morphology)

-    def num(self, string, morphology=None):
+    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "num", morphology)

-    def punct(self, string, morphology=None):
+    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
        return self(string, "punct", morphology)

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
        """Look up a lemma in the table, if available. If no lemma is found,
        the original string is returned.

@@ -141,7 +125,13 @@ class Lemmatizer:
            return lookup_table[key]
        return string

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
        orig = string
        string = string.lower()
        forms = []
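A rough usage sketch of the constructor and lookup API shown above; the import paths are assumed since this diff does not name the files, and the table name follows the docstring:

# Assumed module paths; a minimal sketch, not a full pipeline setup.
from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "mice": "mouse"})
lemmatizer = Lemmatizer(lookups=lookups)
print(lemmatizer.lookup("dogs"))  # "dog"; falls back to the input string if no lemma is found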
@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict

from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
from .strings import get_string_id


UNSET = object()


+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}


class Lookups:
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,

@@ -18,7 +35,7 @@ class Lookups:
    via doc.vocab.lookups.
    """

-    def __init__(self):
+    def __init__(self) -> None:
        """Initialize the Lookups object.

        RETURNS (Lookups): The newly created object.

@@ -27,7 +44,7 @@ class Lookups:
        """
        self._tables = {}

-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name. Delegates to
        Lookups.has_table.

@@ -36,16 +53,16 @@ class Lookups:
        """
        return self.has_table(name)

-    def __len__(self):
+    def __len__(self) -> int:
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
        return list(self._tables.keys())

-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
        """Add a new table to the lookups. Raises an error if the table exists.

        name (str): Unique name of table.

@@ -60,12 +77,12 @@ class Lookups:
        self._tables[name] = table
        return table

-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
        """Get a table. Raises an error if the table doesn't exist and no
        default value is provided.

        name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
        RETURNS (Table): The table.

        DOCS: https://spacy.io/api/lookups#get_table

@@ -76,7 +93,7 @@ class Lookups:
            return default
        return self._tables[name]

-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
        """Remove a table. Raises an error if the table doesn't exist.

        name (str): Name of the table to remove.

@@ -88,7 +105,7 @@ class Lookups:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables.pop(name)

-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
        """Check if the lookups contain a table of a given name.

        name (str): Name of the table.

@@ -98,7 +115,7 @@ class Lookups:
        """
        return name in self._tables

-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the lookups to a bytestring.

        RETURNS (bytes): The serialized Lookups.

@@ -107,7 +124,7 @@ class Lookups:
        """
        return srsly.msgpack_dumps(self._tables)

-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
        """Load the lookups from a bytestring.

        bytes_data (bytes): The data to load.

@@ -120,7 +137,9 @@ class Lookups:
            self._tables[key] = Table(key, value)
        return self

-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist.

@@ -136,7 +155,9 @@ class Lookups:
            with filepath.open("wb") as file_:
                file_.write(self.to_bytes())

-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

@@ -162,7 +183,7 @@ class Table(OrderedDict):
    """

    @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
        """Initialize a new table from a dict.

        data (dict): The dictionary.

@@ -175,7 +196,7 @@ class Table(OrderedDict):
        self.update(data)
        return self

-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
        """Initialize a new table.

        name (str): Optional table name for reference.

@@ -193,7 +214,7 @@ class Table(OrderedDict):
        if data:
            self.update(data)

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
        """Set new key/value pair. String keys will be hashed.

        key (str / int): The key to set.

@@ -203,7 +224,7 @@ class Table(OrderedDict):
        OrderedDict.__setitem__(self, key, value)
        self.bloom.add(key)

-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
        """Set new key/value pair. String keys will be hashed.
        Same as table[key] = value.

@@ -212,7 +233,7 @@ class Table(OrderedDict):
        """
        self[key] = value

-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
        """Get the value for a given key. String keys will be hashed.

        key (str / int): The key to get.

@@ -221,7 +242,7 @@ class Table(OrderedDict):
        key = get_string_id(key)
        return OrderedDict.__getitem__(self, key)

-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
        """Get the value for a given key. String keys will be hashed.

        key (str / int): The key to get.

@@ -231,7 +252,7 @@ class Table(OrderedDict):
        key = get_string_id(key)
        return OrderedDict.get(self, key, default)

-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
        """Check whether a key is in the table. String keys will be hashed.

        key (str / int): The key to check.

@@ -243,7 +264,7 @@ class Table(OrderedDict):
            return False
        return OrderedDict.__contains__(self, key)

-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
        """Serialize table to a bytestring.

        RETURNS (bytes): The serialized table.

@@ -257,7 +278,7 @@ class Table(OrderedDict):
        }
        return srsly.msgpack_dumps(data)

-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
        """Load a table from a bytestring.

        bytes_data (bytes): The data to load.
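A small usage sketch of the Lookups/Table API from this diff; the table name and entries are invented for illustration:

from spacy.lookups import Lookups  # assumed module path

lookups = Lookups()
table = lookups.add_table("lemma_lookup", {"going": "go"})
assert "lemma_lookup" in lookups and len(lookups) == 1
assert table["going"] == "go"          # string keys are hashed via get_string_id
data = lookups.to_bytes()              # msgpack under the hood
reloaded = Lookups().from_bytes(data)
assert reloaded.get_table("lemma_lookup").get("going") == "go"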
@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):


@registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)
    return kb
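A sketch of resolving the registered asset by name and calling it directly, the way the config system would; the paths are hypothetical:

from spacy.util import registry  # the same registry used by the decorator above

load_kb = registry.assets.get("spacy.KBFromFile.v1")
kb = load_kb(vocab_path="/path/to/model/vocab", kb_path="/path/to/kb")  # hypothetical paths
print(kb.get_size_entities())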
@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor

from ..spacy_vectors import SpacyVectors
from ... import util

@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams


@registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -90,13 +71,25 @@ def build_text_classifier(
            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
        )
        prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(PREFIX),
+            dropout=dropout,
+            seed=11,
        )
        suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SUFFIX),
+            dropout=dropout,
+            seed=12,
        )
        shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SHAPE),
+            dropout=dropout,
+            seed=13,
        )

        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
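The reformatted HashEmbed calls make the width bookkeeping easier to see: LOWER gets the full width and PREFIX, SUFFIX and SHAPE get half each, so the concatenated input width is width + 3 * (width // 2). A standalone sketch, assuming Thinc 8.x and made-up column indices:

from thinc.api import HashEmbed

width, embed_size = 64, 2000
lower = HashEmbed(nO=width, nV=embed_size, column=0, dropout=None, seed=10)
prefix = HashEmbed(nO=width // 2, nV=embed_size, column=1, dropout=None, seed=11)
suffix = HashEmbed(nO=width // 2, nV=embed_size, column=2, dropout=None, seed=12)
shape = HashEmbed(nO=width // 2, nV=embed_size, column=3, dropout=None, seed=13)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
assert width_nI == width + 3 * (width // 2)  # 160 for width=64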
@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


@registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
    return tok2vec
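A sketch of building the registered layer directly: the new upstream argument defaults to "*", which lets the listener connect to any upstream Tok2Vec component, while a concrete name pins it to one:

from spacy.util import registry  # assumed import path for the registry

make_tok2vec_tensors = registry.architectures.get("spacy.Tok2VecTensors.v1")
listener = make_tok2vec_tensors(width=96, upstream="*")  # width should match the upstream model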
@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
from wasabi import Printer
import warnings

from .tokens import Doc, Token, Span
from .errors import Errors, Warnings
+from .util import dot_to_dict

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401


-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
    """Analyze a pipeline component with respect to its position in the current
    pipeline and the other components. Will check whether requirements are
    fulfilled (e.g. if previous components assign the attributes).

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
    name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
    """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
    if requires:
        for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                requires[annot] = True
    problems = []
    for annot, fulfilled in requires.items():

@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
    return problems


-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
    """Analyze all pipes in the pipeline in order.

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
    warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
    """
    problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
    return problems


-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
    """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
    objs = {"doc": Doc, "token": Token, "span": Span}
    for obj_key, attrs in data.items():
        if obj_key == "span":

@@ -111,37 +101,40 @@ def validate_attrs(values):
    return values


-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
    assert feature in ["assigns", "requires"]
    result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
        if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
    return result


-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
    attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
    """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")


-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
    """Get all pipeline components that require an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
    attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
    """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")


-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and why they assign and require, as
    well as any problems if available.

@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
    msg.divider("Pipeline Overview")
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    msg.table(overview, header=header, divider=True, multiline=True)

@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
        return {"overview": overview, "problems": problems}


-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
    """Count how many subsequent components require an annotation set by each
    component in the pipeline.

+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
    """
    pipe_assigns = []
    pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
    counts = []
    for i, assigns in enumerate(pipe_assigns):
        count = 0
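A usage sketch of the analysis helpers after this change; the import path is assumed (the file name is not shown in this diff) and nlp is assumed to be an existing Language object whose component factories declare assigns/requires metadata:

from spacy.pipe_analysis import analyze_all_pipes, print_summary  # assumed module path

problems = analyze_all_pipes(nlp, warn=False)  # component name -> unfulfilled requirements
summary = print_summary(nlp, no_print=True)    # {"overview": [...], "problems": {...}}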
@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
from .tok2vec import Tok2Vec
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens

__all__ = [
-    "Tagger",
    "DependencyParser",
-    "EntityRecognizer",
    "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
    "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
    "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
    "SimilarityHook",
    "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
    "merge_entities",
    "merge_noun_chunks",
    "merge_subtokens",
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
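These deleted defaults follow the same pattern now handled through Thinc's config system, as in Config().from_str(DEFAULT_CONFIG) in the Chinese diff above. A sketch of parsing the block just shown (the variable name is made up):

from thinc.api import Config

DEFAULT_NEL_MODEL = """
[model]
@architectures = "spacy.EntityLinker.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null
"""

config = Config().from_str(DEFAULT_NEL_MODEL)
print(config["model"]["@architectures"])  # "spacy.EntityLinker.v1"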
@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null
@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null
@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
Some files were not shown because too many files have changed in this diff.