Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)
	Refactor pipeline components, config and language data (#5759)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py
  Co-authored-by: Ines Montani <ines@ines.io>
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably. Now also includes keyword-only args
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry. Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] list. Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent 311d0bde29
commit 43b960c01b
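The core of the refactor is that a pipeline is now described entirely by its config: the [nlp] block holds the language and the component order as a list, per-component settings live in [components.*] blocks, and everything is built through util.load_model_from_config. A minimal sketch of that flow (the keys beyond "lang" and "pipeline" are illustrative assumptions, not copied from this commit; the real defaults come from spacy/default_config.cfg):

from thinc.api import Config
from spacy import util

# Illustrative config in the new layout: component order as a list under
# [nlp], per-component settings under [components.*]. The "factory" key is
# an assumption about the component block schema.
cfg_str = """
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
factory = "tagger"
"""

config = Config().from_str(cfg_str)
# auto_fill=True completes missing settings from the built-in defaults and
# returns both the constructed nlp object and the fully resolved config.
nlp, filled = util.load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)

The diff below updates the example scripts, packaging metadata and the CLI commands (debug, pretrain, profile, train and friends) to this flow.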
@@ -17,7 +17,6 @@ import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.kb import KnowledgeBase
 
 from spacy.gold import Example
 from spacy.pipeline import EntityRuler

@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
 
     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        kb = KnowledgeBase(vocab=nlp.vocab)
-        kb.load_bulk(kb_path)
-        print("Loaded Knowledge Base from '%s'" % kb_path)
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"kb": kb, "incl_prior": False}
+        print("Loading Knowledge Base from '%s'" % kb_path)
+        cfg = {
+            "kb": {
+                "@assets": "spacy.KBFromFile.v1",
+                "vocab_path": vocab_path,
+                "kb_path": kb_path,
+            },
+            # use only the predicted EL score and not the prior probability (for demo purposes)
+            "incl_prior": False,
+        }
         entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)
 
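The removed lines above passed a live KnowledgeBase into the component config; the new config instead names a registered function ("@assets": "spacy.KBFromFile.v1") that builds the KB from the given paths, per the "provide KB as registry function instead of as object" item in the commit message. A hypothetical sketch of what such a registered loader could look like (the decorator name, signature and return value are assumptions; only the KnowledgeBase and load_bulk calls are taken from the removed code):

from pathlib import Path
from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


# "custom.KBFromFile.v1" is a made-up name; the registry table is implied by
# the "@assets" key used in the config above.
@registry.assets("custom.KBFromFile.v1")
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
    # Rebuild the vocab and KB from disk instead of passing live objects
    # through the component config.
    vocab = Vocab().from_disk(Path(vocab_path))
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(Path(kb_path))
    return kb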
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a18,<8.0.0a20",
+    "thinc>=8.0.0a19,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]
@@ -1,11 +1,11 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a18,<8.0.0a20
+thinc>=8.0.0a19,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.7.0,<1.1.0
+wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0
@@ -34,15 +34,15 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
     blis>=0.4.0,<0.5.0
-    wasabi>=0.7.0,<1.1.0
+    wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0
setup.py

@@ -32,8 +32,14 @@ MOD_NAMES = [
     "spacy.attrs",
     "spacy.kb",
     "spacy.morphology",
-    "spacy.pipeline.pipes",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.multitask",
+    "spacy.pipeline.ner",
+    "spacy.pipeline.pipe",
+    "spacy.pipeline.sentencizer",
+    "spacy.pipeline.senter",
+    "spacy.pipeline.tagger",
     "spacy.syntax.stateclass",
     "spacy.syntax._state",
     "spacy.tokenizer",
@@ -14,7 +14,6 @@ from .about import __version__
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
-from .language import component
 
 
 if sys.maxunicode == 65535:
@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid config override '{opt}'"
+        err = f"Invalid CLI argument '{opt}'"
         if opt.startswith("--"):  # new argument
             opt = opt.replace("--", "").replace("-", "_")
             if "." not in opt:

@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             else:
                 value = args.pop(0)
             # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd b unintuitive to
+            # values. But since they come from the CLI, it'd be unintuitive to
             # explicitly mark strings with escaped quotes. So we're working
             # around that here by falling back to a string if parsing fails.
             # TODO: improve logic to handle simple types like list of strings?

@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             except ValueError:
                 result[opt] = str(value)
         else:
-            msg.fail(f"{err}: options need to start with --", exits=1)
+            msg.fail(f"{err}: override option should start with --", exits=1)
     return result
 
 
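For reference, the parsing above turns dotted --section.key arguments into a flat overrides dict: values go through json.loads where possible and fall back to plain strings otherwise. Roughly:

from spacy.cli._util import parse_config_overrides

# "--training.batch_size 128" overrides "batch_size" in the [training] block;
# "128" is parsed as an int, "en" stays a string because json.loads rejects it.
args = ["--training.batch_size", "128", "--nlp.lang", "en"]
overrides = parse_config_overrides(args)
assert overrides == {"training.batch_size": 128, "nlp.lang": "en"}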
@@ -3,12 +3,12 @@ from pathlib import Path
 from collections import Counter
 import sys
 import srsly
-from wasabi import Printer, MESSAGES, msg
+from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
+from thinc.api import Config
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
-from ..schemas import ConfigSchema
 from ..gold import Corpus, Example
 from ..syntax import nonproj
 from ..language import Language

@@ -33,6 +33,9 @@ def debug_config_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
+    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
+    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
     # fmt: on
 ):
     """Debug a config.cfg file and show validation errors. The command will

@@ -40,14 +43,37 @@ def debug_config_cli(
     validation errors are blocking and will prevent the rest of the config from
     being resolved. This means that you may not see all validation errors at
     once and some issues are only shown once previous errors have been fixed.
+    Similar as with the 'train' command, you can override settings from the config
+    as command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]".
     """
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error():
-        util.load_config(
-            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
-        )
-    msg.good("Config is valid")
+        config = Config().from_disk(config_path)
+        try:
+            nlp, _ = util.load_model_from_config(
+                config, overrides=overrides, auto_fill=auto_fill
+            )
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    is_stdout = output_path is not None and str(output_path) == "-"
+    if auto_fill:
+        orig_config = config.to_str()
+        filled_config = nlp.config.to_str()
+        if orig_config == filled_config:
+            msg.good("Original config is valid, no values were auto-filled")
+        else:
+            msg.good("Auto-filled config is valid")
+            if diff:
+                print(diff_strings(config.to_str(), nlp.config.to_str()))
+    else:
+        msg.good("Original config is valid", show=not is_stdout)
+    if is_stdout:
+        print(nlp.config.to_str())
+    elif output_path is not None:
+        nlp.config.to_disk(output_path)
+        msg.good(f"Saved updated config to {output_path}")
 
 
 @debug_cli.command(
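Outside the CLI, the auto-fill and diff logic added above boils down to a few calls (a sketch using only the functions imported in this file; "config.cfg" is a placeholder path):

from thinc.api import Config
from wasabi import diff_strings
from spacy import util

config = Config().from_disk("config.cfg")
# auto_fill=True completes the config with built-in defaults; nlp.config is
# the fully resolved version, which may differ from what was on disk.
nlp, _ = util.load_model_from_config(config, auto_fill=True)
if config.to_str() != nlp.config.to_str():
    print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
    print("Original config is valid, no values were auto-filled")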
@@ -117,16 +143,13 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    nlp = util.load_model_from_config(config["nlp"])
+        cfg = Config().from_disk(config_path)
+        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+    # TODO: handle base model
     lang = config["nlp"]["lang"]
-    base_model = config["nlp"]["base_model"]
-    pipeline = list(config["nlp"]["pipeline"].keys())
+    base_model = config["training"]["base_model"]
+    pipeline = nlp.pipe_names
+    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
     tag_map_path = util.ensure_path(config["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:

@@ -164,19 +187,17 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset, pipeline, nlp, make_proj=False
+        train_dataset, factory_names, nlp, make_proj=False
     )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
+    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
 
     msg.divider("Training stats")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
-    for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
         msg.text(f"Starting with base model '{base_model}'")
     else:

@@ -244,7 +265,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the model")
 
-    if "ner" in pipeline:
+    if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
             label for label in gold_train_data["ner"] if label not in ("O", "-", None)

@@ -332,7 +353,7 @@ def debug_data(
                 "with punctuation can not be trained with a noise level > 0."
             )
 
-    if "textcat" in pipeline:
+    if "textcat" in factory_names:
         msg.divider("Text Classification")
         labels = [label for label in gold_train_data["cats"]]
         model_labels = _get_labels_from_model(nlp, "textcat")

@@ -379,7 +400,7 @@ def debug_data(
                     "contains only instances with mutually-exclusive classes."
                 )
 
-    if "tagger" in pipeline:
+    if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map

@@ -394,7 +415,7 @@ def debug_data(
         for label in non_tagmap:
             msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 
-    if "parser" in pipeline:
+    if "parser" in factory_names:
         has_low_data_warning = False
         msg.divider("Dependency Parsing")
 

@@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
 
 
 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
+    examples: Sequence[Example],
+    factory_names: List[str],
+    nlp: Language,
+    make_proj: bool,
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),

@@ -573,7 +597,7 @@ def _compile_gold(
             for word in valid_words:
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
-        if "ner" in pipeline:
+        if "ner" in factory_names:
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue

@@ -595,14 +619,14 @@ def _compile_gold(
                     data["ner"][combined_label] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "textcat" in pipeline:
+        if "textcat" in factory_names:
             data["cats"].update(gold.cats)
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
-        if "tagger" in pipeline:
+        if "tagger" in factory_names:
             tags = eg.get_aligned("TAG", as_string=True)
             data["tags"].update([x for x in tags if x is not None])
-        if "parser" in pipeline:
+        if "parser" in factory_names:
             aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
             data["deps"].update([x for x in aligned_deps if x is not None])
             for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
						 | 
					@ -1,8 +1,11 @@
 | 
				
			||||||
 | 
					from typing import Dict, Any, Optional
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 | 
					from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
 | 
				
			||||||
 | 
					from thinc.api import Model
 | 
				
			||||||
 | 
					import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._util import Arg, Opt, debug_cli
 | 
					from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..lang.en import English
 | 
					from ..lang.en import English
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,8 +13,10 @@ from ..lang.en import English
 | 
				
			||||||
@debug_cli.command("model")
 | 
					@debug_cli.command("model")
 | 
				
			||||||
def debug_model_cli(
 | 
					def debug_model_cli(
 | 
				
			||||||
    # fmt: off
 | 
					    # fmt: off
 | 
				
			||||||
 | 
					    ctx: typer.Context,  # This is only used to read additional arguments
 | 
				
			||||||
    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
					    config_path: Path = Arg(..., help="Path to config file", exists=True),
 | 
				
			||||||
    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
 | 
					    section: str = Arg(..., help="Section that defines the model to be analysed"),
 | 
				
			||||||
 | 
					    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
 | 
				
			||||||
    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
					    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
 | 
				
			||||||
    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
					    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
 | 
				
			||||||
    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
					    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
 | 
				
			||||||
| 
						 | 
					@ -20,14 +25,18 @@ def debug_model_cli(
 | 
				
			||||||
    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
					    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
 | 
				
			||||||
    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
					    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
 | 
				
			||||||
    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
					    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
 | 
				
			||||||
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
 | 
				
			||||||
    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
 | 
					 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
					    Analyze a Thinc model implementation. Includes checks for internal structure
 | 
				
			||||||
    and activations during training.
 | 
					    and activations during training.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
 | 
					        msg.info("Using GPU")
 | 
				
			||||||
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
    print_settings = {
 | 
					    print_settings = {
 | 
				
			||||||
        "dimensions": dimensions,
 | 
					        "dimensions": dimensions,
 | 
				
			||||||
        "parameters": parameters,
 | 
					        "parameters": parameters,
 | 
				
			||||||
| 
						 | 
					@ -39,27 +48,47 @@ def debug_model_cli(
 | 
				
			||||||
        "print_after_training": P2,
 | 
					        "print_after_training": P2,
 | 
				
			||||||
        "print_prediction": P3,
 | 
					        "print_prediction": P3,
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    config_overrides = parse_config_overrides(ctx.args)
 | 
				
			||||||
 | 
					    cfg = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
 | 
				
			||||||
 | 
					        except ValueError as e:
 | 
				
			||||||
 | 
					            msg.fail(str(e), exits=1)
 | 
				
			||||||
 | 
					    seed = config["pretraining"]["seed"]
 | 
				
			||||||
    if seed is not None:
 | 
					    if seed is not None:
 | 
				
			||||||
        msg.info(f"Fixing random seed: {seed}")
 | 
					        msg.info(f"Fixing random seed: {seed}")
 | 
				
			||||||
        fix_random_seed(seed)
 | 
					        fix_random_seed(seed)
 | 
				
			||||||
    if use_gpu >= 0:
 | 
					 | 
				
			||||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
					 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        msg.info(f"Using CPU")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    debug_model(
 | 
					    component = config
 | 
				
			||||||
        config_path, print_settings=print_settings,
 | 
					    parts = section.split(".")
 | 
				
			||||||
 | 
					    for item in parts:
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            component = component[item]
 | 
				
			||||||
 | 
					        except KeyError:
 | 
				
			||||||
 | 
					            msg.fail(
 | 
				
			||||||
 | 
					                f"The section '{section}' is not a valid section in the provided config.",
 | 
				
			||||||
 | 
					                exits=1,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					    if hasattr(component, "model"):
 | 
				
			||||||
 | 
					        model = component.model
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        msg.fail(
 | 
				
			||||||
 | 
					            f"The section '{section}' does not specify an object that holds a Model.",
 | 
				
			||||||
 | 
					            exits=1,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    debug_model(model, print_settings=print_settings)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def debug_model(config_path: Path, *, print_settings=None):
 | 
					def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
 | 
				
			||||||
 | 
					    if not isinstance(model, Model):
 | 
				
			||||||
 | 
					        msg.fail(
 | 
				
			||||||
 | 
					            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
 | 
				
			||||||
 | 
					            exits=1,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
    if print_settings is None:
 | 
					    if print_settings is None:
 | 
				
			||||||
        print_settings = {}
 | 
					        print_settings = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    model = util.load_config(config_path, create_objects=True)["model"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # STEP 0: Printing before training
 | 
					    # STEP 0: Printing before training
 | 
				
			||||||
    msg.info(f"Analysing model with ID {model.id}")
 | 
					    msg.info(f"Analysing model with ID {model.id}")
 | 
				
			||||||
    if print_settings.get("print_before_training"):
 | 
					    if print_settings.get("print_before_training"):
 | 
				
			||||||
| 
						 | 
					@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
 | 
				
			||||||
        _print_model(model, print_settings)
 | 
					        _print_model(model, print_settings)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # STEP 1: Initializing the model and printing again
 | 
					    # STEP 1: Initializing the model and printing again
 | 
				
			||||||
    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
 | 
					    Y = _get_output(model.ops.xp)
 | 
				
			||||||
 | 
					    _set_output_dim(nO=Y.shape[-1], model=model)
 | 
				
			||||||
 | 
					    model.initialize(X=_get_docs(), Y=Y)
 | 
				
			||||||
    if print_settings.get("print_after_init"):
 | 
					    if print_settings.get("print_after_init"):
 | 
				
			||||||
        msg.info(f"After initialization:")
 | 
					        msg.info(f"After initialization:")
 | 
				
			||||||
        _print_model(model, print_settings)
 | 
					        _print_model(model, print_settings)
 | 
				
			||||||
| 
						 | 
					@ -110,12 +141,16 @@ def _get_docs():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _get_output(xp):
 | 
					def _get_output(xp):
 | 
				
			||||||
    return xp.asarray(
 | 
					    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
 | 
				
			||||||
        [
 | 
					
 | 
				
			||||||
            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
 | 
					
 | 
				
			||||||
            for i, _ in enumerate(_get_docs())
 | 
					def _set_output_dim(model, nO):
 | 
				
			||||||
        ]
 | 
					    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
 | 
				
			||||||
    )
 | 
					    if model.has_dim("nO") is None:
 | 
				
			||||||
 | 
					        model.set_dim("nO", nO)
 | 
				
			||||||
 | 
					    if model.has_ref("output_layer"):
 | 
				
			||||||
 | 
					        if model.get_ref("output_layer").has_dim("nO") is None:
 | 
				
			||||||
 | 
					            model.get_ref("output_layer").set_dim("nO", nO)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _print_model(model, print_settings):
 | 
					def _print_model(model, print_settings):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -105,9 +105,10 @@ def evaluate(
 | 
				
			||||||
        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
 | 
					        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if displacy_path:
 | 
					    if displacy_path:
 | 
				
			||||||
 | 
					        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
 | 
				
			||||||
        docs = [ex.predicted for ex in dev_dataset]
 | 
					        docs = [ex.predicted for ex in dev_dataset]
 | 
				
			||||||
        render_deps = "parser" in nlp.meta.get("pipeline", [])
 | 
					        render_deps = "parser" in factory_names
 | 
				
			||||||
        render_ents = "ner" in nlp.meta.get("pipeline", [])
 | 
					        render_ents = "ner" in factory_names
 | 
				
			||||||
        render_parses(
 | 
					        render_parses(
 | 
				
			||||||
            docs,
 | 
					            docs,
 | 
				
			||||||
            displacy_path,
 | 
					            displacy_path,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
 | 
				
			||||||
        msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
					        msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
				
			||||||
    meta = srsly.read_json(meta_path)
 | 
					    meta = srsly.read_json(meta_path)
 | 
				
			||||||
    if model_path.resolve() != model_path:
 | 
					    if model_path.resolve() != model_path:
 | 
				
			||||||
        meta["link"] = str(model_path)
 | 
					 | 
				
			||||||
        meta["source"] = str(model_path.resolve())
 | 
					        meta["source"] = str(model_path.resolve())
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        meta["source"] = str(model_path)
 | 
					        meta["source"] = str(model_path)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -125,7 +125,6 @@ def get_meta(
 | 
				
			||||||
    meta.update(existing_meta)
 | 
					    meta.update(existing_meta)
 | 
				
			||||||
    nlp = util.load_model_from_path(Path(model_path))
 | 
					    nlp = util.load_model_from_path(Path(model_path))
 | 
				
			||||||
    meta["spacy_version"] = util.get_model_version_range(about.__version__)
 | 
					    meta["spacy_version"] = util.get_model_version_range(about.__version__)
 | 
				
			||||||
    meta["pipeline"] = nlp.pipe_names
 | 
					 | 
				
			||||||
    meta["vectors"] = {
 | 
					    meta["vectors"] = {
 | 
				
			||||||
        "width": nlp.vocab.vectors_length,
 | 
					        "width": nlp.vocab.vectors_length,
 | 
				
			||||||
        "vectors": len(nlp.vocab.vectors),
 | 
					        "vectors": len(nlp.vocab.vectors),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,7 +5,7 @@ import time
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 | 
					from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 | 
				
			||||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
					from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 | 
				
			||||||
from thinc.api import CosineDistance, L2Distance
 | 
					from thinc.api import CosineDistance, L2Distance
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
| 
						 | 
					@ -15,7 +15,6 @@ import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
					from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code
 | 
				
			||||||
from ..schemas import ConfigSchema
 | 
					 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
					from ..ml.models.multi_task import build_cloze_multi_task_model
 | 
				
			||||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
					from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 | 
				
			||||||
| 
						 | 
					@ -37,6 +36,7 @@ def pretrain_cli(
 | 
				
			||||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
					    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
					    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
 | 
				
			||||||
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
					    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
 | 
				
			||||||
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -67,6 +67,7 @@ def pretrain_cli(
 | 
				
			||||||
        config_overrides=overrides,
 | 
					        config_overrides=overrides,
 | 
				
			||||||
        resume_path=resume_path,
 | 
					        resume_path=resume_path,
 | 
				
			||||||
        epoch_resume=epoch_resume,
 | 
					        epoch_resume=epoch_resume,
 | 
				
			||||||
 | 
					        use_gpu=use_gpu,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,40 +78,29 @@ def pretrain(
 | 
				
			||||||
    config_overrides: Dict[str, Any] = {},
 | 
					    config_overrides: Dict[str, Any] = {},
 | 
				
			||||||
    resume_path: Optional[Path] = None,
 | 
					    resume_path: Optional[Path] = None,
 | 
				
			||||||
    epoch_resume: Optional[int] = None,
 | 
					    epoch_resume: Optional[int] = None,
 | 
				
			||||||
 | 
					    use_gpu: int = -1,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
					    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
 | 
				
			||||||
    msg.info(f"Loading config from: {config_path}")
 | 
					 | 
				
			||||||
    with show_validation_error():
 | 
					 | 
				
			||||||
        config = util.load_config(
 | 
					 | 
				
			||||||
            config_path,
 | 
					 | 
				
			||||||
            create_objects=False,
 | 
					 | 
				
			||||||
            validate=True,
 | 
					 | 
				
			||||||
            schema=ConfigSchema,
 | 
					 | 
				
			||||||
            overrides=config_overrides,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if not output_dir.exists():
 | 
					 | 
				
			||||||
        output_dir.mkdir()
 | 
					 | 
				
			||||||
        msg.good(f"Created output directory: {output_dir}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    use_gpu = config["training"]["use_gpu"]
 | 
					 | 
				
			||||||
    if use_gpu >= 0:
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
        msg.info("Using GPU")
 | 
					        msg.info("Using GPU")
 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info("Using CPU")
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
 | 
					    msg.info(f"Loading config from: {config_path}")
 | 
				
			||||||
 | 
					    config = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
				
			||||||
 | 
					    # TODO: validate that [pretraining] block exists
 | 
				
			||||||
 | 
					    if not output_dir.exists():
 | 
				
			||||||
 | 
					        output_dir.mkdir()
 | 
				
			||||||
 | 
					        msg.good(f"Created output directory: {output_dir}")
 | 
				
			||||||
    seed = config["pretraining"]["seed"]
 | 
					    seed = config["pretraining"]["seed"]
 | 
				
			||||||
    if seed is not None:
 | 
					    if seed is not None:
 | 
				
			||||||
        fix_random_seed(seed)
 | 
					        fix_random_seed(seed)
 | 
				
			||||||
    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
					    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
 | 
				
			||||||
        use_pytorch_for_gpu_memory()
 | 
					        use_pytorch_for_gpu_memory()
 | 
				
			||||||
 | 
					    config.to_disk(output_dir / "config.cfg")
 | 
				
			||||||
    nlp_config = config["nlp"]
 | 
					 | 
				
			||||||
    srsly.write_json(output_dir / "config.json", config)
 | 
					 | 
				
			||||||
    msg.good("Saved config file in the output directory")
 | 
					    msg.good("Saved config file in the output directory")
 | 
				
			||||||
 | 
					 | 
				
			||||||
    config = util.load_config(config_path, create_objects=True)
 | 
					 | 
				
			||||||
    nlp = util.load_model_from_config(nlp_config)
 | 
					 | 
				
			||||||
    pretrain_config = config["pretraining"]
 | 
					    pretrain_config = config["pretraining"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if texts_loc != "-":  # reading from a file
 | 
					    if texts_loc != "-":  # reading from a file
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -25,7 +25,7 @@ def profile_cli(
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
					    Profile which functions take the most time in a spaCy pipeline.
 | 
				
			||||||
    Input should be formatted as one JSON object per line with a key "text".
 | 
					    Input should be formatted as one JSON object per line with a key "text".
 | 
				
			||||||
    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
					    It can either be provided as a JSONL file, or be read from sys.sytdin.
 | 
				
			||||||
    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
					    If no input file is specified, the IMDB dataset is loaded via Thinc.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,4 @@
 | 
				
			||||||
from typing import Optional, Dict, Any
 | 
					from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					from timeit import default_timer as timer
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import tqdm
 | 
					import tqdm
 | 
				
			||||||
| 
						 | 
					@ -7,6 +7,7 @@ from wasabi import msg
 | 
				
			||||||
import thinc
 | 
					import thinc
 | 
				
			||||||
import thinc.schedules
 | 
					import thinc.schedules
 | 
				
			||||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
					from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
 | 
				
			||||||
 | 
					from thinc.api import Config, Optimizer
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import typer
 | 
					import typer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code
 | 
				
			||||||
from ..gold import Corpus, Example
 | 
					from ..gold import Corpus, Example
 | 
				
			||||||
from ..lookups import Lookups
 | 
					from ..lookups import Lookups
 | 
				
			||||||
 | 
					from ..language import Language
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..schemas import ConfigSchema
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Don't remove - required to load the built-in architectures
 | 
					# Don't remove - required to load the built-in architectures
 | 
				
			||||||
from ..ml import models  # noqa: F401
 | 
					from ..ml import models  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
registry = util.registry
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@app.command(
 | 
					@app.command(
 | 
				
			||||||
    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
					    "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
| 
						 | 
					@ -38,6 +36,8 @@ def train_cli(
 | 
				
			||||||
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
					    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
 | 
				
			||||||
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
					    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
					    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
				
			||||||
 | 
					    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
 | 
					    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -53,9 +53,7 @@ def train_cli(
 | 
				
			||||||
    referenced in the config.
 | 
					    referenced in the config.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    util.set_env_log(verbose)
 | 
					    util.set_env_log(verbose)
 | 
				
			||||||
    verify_cli_args(
 | 
					    verify_cli_args(train_path, dev_path, config_path)
 | 
				
			||||||
        train_path=train_path, dev_path=dev_path, config_path=config_path,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    overrides = parse_config_overrides(ctx.args)
 | 
					    overrides = parse_config_overrides(ctx.args)
 | 
				
			||||||
    import_code(code_path)
 | 
					    import_code(code_path)
 | 
				
			||||||
    train(
 | 
					    train(
 | 
				
			||||||
| 
						 | 
					@ -63,6 +61,8 @@ def train_cli(
 | 
				
			||||||
        {"train": train_path, "dev": dev_path},
 | 
					        {"train": train_path, "dev": dev_path},
 | 
				
			||||||
        output_path=output_path,
 | 
					        output_path=output_path,
 | 
				
			||||||
        config_overrides=overrides,
 | 
					        config_overrides=overrides,
 | 
				
			||||||
 | 
					        use_gpu=use_gpu,
 | 
				
			||||||
 | 
					        resume_training=resume,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -72,61 +72,51 @@ def train(
 | 
				
			||||||
    raw_text: Optional[Path] = None,
 | 
					    raw_text: Optional[Path] = None,
 | 
				
			||||||
    output_path: Optional[Path] = None,
 | 
					    output_path: Optional[Path] = None,
 | 
				
			||||||
    config_overrides: Dict[str, Any] = {},
 | 
					    config_overrides: Dict[str, Any] = {},
 | 
				
			||||||
 | 
					    use_gpu: int = -1,
 | 
				
			||||||
 | 
					    resume_training: bool = False,
 | 
				
			||||||
) -> None:
 | 
					) -> None:
 | 
				
			||||||
    msg.info(f"Loading config from: {config_path}")
 | 
					 | 
				
			||||||
    # Read the config first without creating objects, to get to the original nlp_config
 | 
					 | 
				
			||||||
    with show_validation_error():
 | 
					 | 
				
			||||||
        config = util.load_config(
 | 
					 | 
				
			||||||
            config_path,
 | 
					 | 
				
			||||||
            create_objects=False,
 | 
					 | 
				
			||||||
            schema=ConfigSchema,
 | 
					 | 
				
			||||||
            overrides=config_overrides,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    use_gpu = config["training"]["use_gpu"]
 | 
					 | 
				
			||||||
    if use_gpu >= 0:
 | 
					    if use_gpu >= 0:
 | 
				
			||||||
        msg.info(f"Using GPU: {use_gpu}")
 | 
					        msg.info(f"Using GPU: {use_gpu}")
 | 
				
			||||||
        require_gpu(use_gpu)
 | 
					        require_gpu(use_gpu)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info("Using CPU")
 | 
					        msg.info("Using CPU")
 | 
				
			||||||
 | 
					    msg.info(f"Loading config and nlp from: {config_path}")
 | 
				
			||||||
 | 
					    config = Config().from_disk(config_path)
 | 
				
			||||||
 | 
					    with show_validation_error():
 | 
				
			||||||
 | 
					        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
 | 
				
			||||||
 | 
					    if config["training"]["base_model"]:
 | 
				
			||||||
 | 
					        base_nlp = util.load_model(config["training"]["base_model"])
 | 
				
			||||||
 | 
					        # TODO: do something to check base_nlp against regular nlp described in config?
 | 
				
			||||||
 | 
					        nlp = base_nlp
 | 
				
			||||||
 | 
					    verify_config(nlp)
 | 
				
			||||||
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
					    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
 | 
				
			||||||
    if config["training"]["seed"] is not None:
 | 
					    if config["training"]["seed"] is not None:
 | 
				
			||||||
        fix_random_seed(config["training"]["seed"])
 | 
					        fix_random_seed(config["training"]["seed"])
 | 
				
			||||||
    if config["training"].get("use_pytorch_for_gpu_memory"):
 | 
					    if config["training"]["use_pytorch_for_gpu_memory"]:
 | 
				
			||||||
        # It feels kind of weird to not have a default for this.
 | 
					        # It feels kind of weird to not have a default for this.
 | 
				
			||||||
        use_pytorch_for_gpu_memory()
 | 
					        use_pytorch_for_gpu_memory()
 | 
				
			||||||
    nlp_config = config["nlp"]
 | 
					 | 
				
			||||||
    config = util.load_config(
 | 
					 | 
				
			||||||
        config_path,
 | 
					 | 
				
			||||||
        create_objects=True,
 | 
					 | 
				
			||||||
        schema=ConfigSchema,
 | 
					 | 
				
			||||||
        overrides=config_overrides,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    training = config["training"]
 | 
					    training = config["training"]
 | 
				
			||||||
    msg.info("Creating nlp from config")
 | 
					 | 
				
			||||||
    nlp = util.load_model_from_config(nlp_config)
 | 
					 | 
				
			||||||
    optimizer = training["optimizer"]
 | 
					    optimizer = training["optimizer"]
 | 
				
			||||||
    limit = training["limit"]
 | 
					    limit = training["limit"]
 | 
				
			||||||
    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
					    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
				
			||||||
    if "textcat" in nlp_config["pipeline"]:
 | 
					    if resume_training:
 | 
				
			||||||
        verify_textcat_config(nlp, nlp_config)
 | 
					 | 
				
			||||||
    if training.get("resume", False):
 | 
					 | 
				
			||||||
        msg.info("Resuming training")
 | 
					        msg.info("Resuming training")
 | 
				
			||||||
        nlp.resume_training()
 | 
					        nlp.resume_training()
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
					        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
				
			||||||
        train_examples = list(
 | 
					        train_examples = corpus.train_dataset(
 | 
				
			||||||
            corpus.train_dataset(
 | 
					 | 
				
			||||||
            nlp,
 | 
					            nlp,
 | 
				
			||||||
            shuffle=False,
 | 
					            shuffle=False,
 | 
				
			||||||
            gold_preproc=training["gold_preproc"],
 | 
					            gold_preproc=training["gold_preproc"],
 | 
				
			||||||
            max_length=training["max_length"],
 | 
					            max_length=training["max_length"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        )
 | 
					        train_examples = list(train_examples)
 | 
				
			||||||
        nlp.begin_training(lambda: train_examples)
 | 
					        nlp.begin_training(lambda: train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if tag_map:
 | 
				
			||||||
        # Replace tag map with provided mapping
 | 
					        # Replace tag map with provided mapping
 | 
				
			||||||
        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
					        nlp.vocab.morphology.load_tag_map(tag_map)
 | 
				
			||||||
 | 
					    if morph_rules:
 | 
				
			||||||
        # Load morph rules
 | 
					        # Load morph rules
 | 
				
			||||||
        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
					        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -151,9 +141,8 @@ def train(
 | 
				
			||||||
        for subpath in tok2vec_path.split("."):
 | 
					        for subpath in tok2vec_path.split("."):
 | 
				
			||||||
            tok2vec = tok2vec.get(subpath)
 | 
					            tok2vec = tok2vec.get(subpath)
 | 
				
			||||||
        if not tok2vec:
 | 
					        if not tok2vec:
 | 
				
			||||||
            msg.fail(
 | 
					            err = f"Could not locate the tok2vec model at {tok2vec_path}"
 | 
				
			||||||
                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
 | 
					            msg.fail(err, exits=1)
 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        tok2vec.from_bytes(weights_data)
 | 
					        tok2vec.from_bytes(weights_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg.info("Loading training corpus")
 | 
					    msg.info("Loading training corpus")
 | 
				
			||||||
| 
						 | 
					@ -169,12 +158,11 @@ def train(
 | 
				
			||||||
        evaluate,
 | 
					        evaluate,
 | 
				
			||||||
        dropout=training["dropout"],
 | 
					        dropout=training["dropout"],
 | 
				
			||||||
        accumulate_gradient=training["accumulate_gradient"],
 | 
					        accumulate_gradient=training["accumulate_gradient"],
 | 
				
			||||||
        patience=training.get("patience", 0),
 | 
					        patience=training["patience"],
 | 
				
			||||||
        max_steps=training.get("max_steps", 0),
 | 
					        max_steps=training["max_steps"],
 | 
				
			||||||
        eval_frequency=training["eval_frequency"],
 | 
					        eval_frequency=training["eval_frequency"],
 | 
				
			||||||
        raw_text=raw_text,
 | 
					        raw_text=raw_text,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					 | 
				
			||||||
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
					    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
 | 
				
			||||||
    print_row = setup_printer(training, nlp)
 | 
					    print_row = setup_printer(training, nlp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -209,8 +197,10 @@ def train(
 | 
				
			||||||
            msg.good(f"Saved model to output directory {final_model_path}")
 | 
					            msg.good(f"Saved model to output directory {final_model_path}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_train_batches(nlp, corpus, cfg):
 | 
					def create_train_batches(
 | 
				
			||||||
    max_epochs = cfg.get("max_epochs", 0)
 | 
					    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    max_epochs = cfg["max_epochs"]
 | 
				
			||||||
    train_examples = list(
 | 
					    train_examples = list(
 | 
				
			||||||
        corpus.train_dataset(
 | 
					        corpus.train_dataset(
 | 
				
			||||||
            nlp,
 | 
					            nlp,
 | 
				
			||||||
| 
						 | 
					@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
            max_length=cfg["max_length"],
 | 
					            max_length=cfg["max_length"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					 | 
				
			||||||
    epoch = 0
 | 
					    epoch = 0
 | 
				
			||||||
    batch_strategy = cfg.get("batch_by", "sequences")
 | 
					    batch_strategy = cfg["batch_by"]
 | 
				
			||||||
    while True:
 | 
					    while True:
 | 
				
			||||||
        if len(train_examples) == 0:
 | 
					        if len(train_examples) == 0:
 | 
				
			||||||
            raise ValueError(Errors.E988)
 | 
					            raise ValueError(Errors.E988)
 | 
				
			||||||
| 
						 | 
					@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
					            batches = util.minibatch(train_examples, size=cfg["batch_size"])
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
					        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            first = next(batches)
 | 
					            first = next(batches)
 | 
				
			||||||
| 
						 | 
					@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
        random.shuffle(train_examples)
 | 
					        random.shuffle(train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
					def create_evaluation_callback(
 | 
				
			||||||
    def evaluate():
 | 
					    nlp: Language,
 | 
				
			||||||
        dev_examples = list(
 | 
					    optimizer: Optimizer,
 | 
				
			||||||
            corpus.dev_dataset(
 | 
					    corpus: Corpus,
 | 
				
			||||||
 | 
					    cfg: Union[Config, Dict[str, Any]],
 | 
				
			||||||
 | 
					) -> Callable[[], Tuple[float, Dict[str, float]]]:
 | 
				
			||||||
 | 
					    def evaluate() -> Tuple[float, Dict[str, float]]:
 | 
				
			||||||
 | 
					        dev_examples = corpus.dev_dataset(
 | 
				
			||||||
            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
					            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        )
 | 
					        dev_examples = list(dev_examples)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
					        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
				
			||||||
        batch_size = cfg.get("evaluation_batch_size", 128)
 | 
					        batch_size = cfg["eval_batch_size"]
 | 
				
			||||||
        start_time = timer()
 | 
					        start_time = timer()
 | 
				
			||||||
 | 
					 | 
				
			||||||
        if optimizer.averages:
 | 
					        if optimizer.averages:
 | 
				
			||||||
            with nlp.use_params(optimizer.averages):
 | 
					            with nlp.use_params(optimizer.averages):
 | 
				
			||||||
                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
					                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
 | 
				
			||||||
| 
						 | 
					@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
 | 
					            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
 | 
				
			||||||
        except KeyError as e:
 | 
					        except KeyError as e:
 | 
				
			||||||
            raise KeyError(
 | 
					            keys = list(scores.keys())
 | 
				
			||||||
                Errors.E983.format(
 | 
					            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
 | 
				
			||||||
                    dict="score_weights", key=str(e), keys=list(scores.keys())
 | 
					            raise KeyError(err)
 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        scores["speed"] = wps
 | 
					        scores["speed"] = wps
 | 
				
			||||||
        return weighted_score, scores
 | 
					        return weighted_score, scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def train_while_improving(
 | 
					def train_while_improving(
 | 
				
			||||||
    nlp,
 | 
					    nlp: Language,
 | 
				
			||||||
    optimizer,
 | 
					    optimizer: Optimizer,
 | 
				
			||||||
    train_data,
 | 
					    train_data,
 | 
				
			||||||
    evaluate,
 | 
					    evaluate,
 | 
				
			||||||
    *,
 | 
					    *,
 | 
				
			||||||
    dropout,
 | 
					    dropout: float,
 | 
				
			||||||
    eval_frequency,
 | 
					    eval_frequency: int,
 | 
				
			||||||
    accumulate_gradient=1,
 | 
					    accumulate_gradient: int,
 | 
				
			||||||
    patience=0,
 | 
					    patience: int,
 | 
				
			||||||
    max_steps=0,
 | 
					    max_steps: int,
 | 
				
			||||||
    raw_text=None,
 | 
					    raw_text: List[Dict[str, str]],
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """Train until an evaluation stops improving. Works as a generator,
 | 
					    """Train until an evaluation stops improving. Works as a generator,
 | 
				
			||||||
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
					    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
 | 
				
			||||||
| 
						 | 
					@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
 | 
				
			||||||
        yield subbatch
 | 
					        yield subbatch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def setup_printer(training, nlp):
 | 
					def setup_printer(
 | 
				
			||||||
 | 
					    training: Union[Dict[str, Any], Config], nlp: Language
 | 
				
			||||||
 | 
					) -> Callable[[Dict[str, Any]], None]:
 | 
				
			||||||
    score_cols = training["scores"]
 | 
					    score_cols = training["scores"]
 | 
				
			||||||
    score_widths = [max(len(col), 6) for col in score_cols]
 | 
					    score_widths = [max(len(col), 6) for col in score_cols]
 | 
				
			||||||
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
					    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
				
			||||||
| 
						 | 
					@ -423,11 +412,10 @@ def setup_printer(training, nlp):
 | 
				
			||||||
    table_header = [col.upper() for col in table_header]
 | 
					    table_header = [col.upper() for col in table_header]
 | 
				
			||||||
    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
					    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
				
			||||||
    table_aligns = ["r" for _ in table_widths]
 | 
					    table_aligns = ["r" for _ in table_widths]
 | 
				
			||||||
 | 
					 | 
				
			||||||
    msg.row(table_header, widths=table_widths)
 | 
					    msg.row(table_header, widths=table_widths)
 | 
				
			||||||
    msg.row(["-" * width for width in table_widths])
 | 
					    msg.row(["-" * width for width in table_widths])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def print_row(info):
 | 
					    def print_row(info: Dict[str, Any]) -> None:
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            losses = [
 | 
					            losses = [
 | 
				
			||||||
                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
					                "{0:.2f}".format(float(info["losses"][pipe_name]))
 | 
				
			||||||
| 
						 | 
					@ -463,7 +451,9 @@ def setup_printer(training, nlp):
 | 
				
			||||||
    return print_row
 | 
					    return print_row
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def update_meta(training, nlp, info):
 | 
					def update_meta(
 | 
				
			||||||
 | 
					    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 | 
				
			||||||
 | 
					) -> None:
 | 
				
			||||||
    score_cols = training["scores"]
 | 
					    score_cols = training["scores"]
 | 
				
			||||||
    nlp.meta["performance"] = {}
 | 
					    nlp.meta["performance"] = {}
 | 
				
			||||||
    for metric in score_cols:
 | 
					    for metric in score_cols:
 | 
				
			||||||
| 
						 | 
					@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
 | 
				
			||||||
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
					        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def load_from_paths(config):
 | 
					def load_from_paths(
 | 
				
			||||||
 | 
					    config: Config,
 | 
				
			||||||
 | 
					) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
 | 
				
			||||||
    # TODO: separate checks from loading
 | 
					    # TODO: separate checks from loading
 | 
				
			||||||
    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
					    raw_text = util.ensure_path(config["training"]["raw_text"])
 | 
				
			||||||
    if raw_text is not None:
 | 
					    if raw_text is not None:
 | 
				
			||||||
| 
						 | 
					@ -506,7 +498,7 @@ def verify_cli_args(
 | 
				
			||||||
    dev_path: Path,
 | 
					    dev_path: Path,
 | 
				
			||||||
    config_path: Path,
 | 
					    config_path: Path,
 | 
				
			||||||
    output_path: Optional[Path] = None,
 | 
					    output_path: Optional[Path] = None,
 | 
				
			||||||
):
 | 
					) -> None:
 | 
				
			||||||
    # Make sure all files and paths exists if they are needed
 | 
					    # Make sure all files and paths exists if they are needed
 | 
				
			||||||
    if not config_path or not config_path.exists():
 | 
					    if not config_path or not config_path.exists():
 | 
				
			||||||
        msg.fail("Config file not found", config_path, exits=1)
 | 
					        msg.fail("Config file not found", config_path, exits=1)
 | 
				
			||||||
| 
						 | 
					@ -528,12 +520,23 @@ def verify_cli_args(
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def verify_textcat_config(nlp, nlp_config):
 | 
					def verify_config(nlp: Language) -> None:
 | 
				
			||||||
 | 
					    """Perform additional checks based on the config and loaded nlp object."""
 | 
				
			||||||
 | 
					    # TODO: maybe we should validate based on the actual components, the list
 | 
				
			||||||
 | 
					    # in config["nlp"]["pipeline"] instead?
 | 
				
			||||||
 | 
					    for pipe_config in nlp.config["components"].values():
 | 
				
			||||||
 | 
					        # We can't assume that the component name == the factory
 | 
				
			||||||
 | 
					        factory = pipe_config["@factories"]
 | 
				
			||||||
 | 
					        if factory == "textcat":
 | 
				
			||||||
 | 
					            verify_textcat_config(nlp, pipe_config)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
 | 
				
			||||||
    # if 'positive_label' is provided: double check whether it's in the data and
 | 
					    # if 'positive_label' is provided: double check whether it's in the data and
 | 
				
			||||||
    # the task is binary
 | 
					    # the task is binary
 | 
				
			||||||
    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
					    if pipe_config.get("positive_label"):
 | 
				
			||||||
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
					        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
				
			||||||
        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
					        pos_label = pipe_config.get("positive_label")
 | 
				
			||||||
        if pos_label not in textcat_labels:
 | 
					        if pos_label not in textcat_labels:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
					                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
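To make the new verify_config() above easier to follow, here is a minimal sketch (not part of the diff) of the kind of [components] block it iterates over: each component block names its factory via the @factories key (see E984 in spacy/errors.py further down), and verify_textcat_config() reads per-component settings such as positive_label from that same block. The contents of the "textcat" block and the "POSITIVE" label are made up for illustration.

# Hypothetical config excerpt, parsed with Thinc's Config class the same way
# train() above parses user configs. Keys starting with "@" are kept as plain
# strings at this point; they are only resolved against the registry when the
# nlp object is actually created.
from thinc.api import Config

cfg = Config().from_str("""
[components]

[components.textcat]
@factories = "textcat"
positive_label = "POSITIVE"
""")

for name, pipe_config in cfg["components"].items():
    print(name, pipe_config["@factories"], pipe_config.get("positive_label"))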
							
								
								
									
spacy/default_config.cfg | 102 (new file)

@@ -0,0 +1,102 @@
+[nlp]
+lang = null
+stop_words = []
+lex_attr_getters = {}
+pipeline = []
+
+[nlp.tokenizer]
+@tokenizers = "spacy.Tokenizer.v1"
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = true
+has_letters = true
+
+[components]
+
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 5000
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+eval_batch_size = 128
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+discard_oversize = false
+omit_extra_lookups = false
+batch_by = "sequences"
+raw_text = null
+tag_map = null
+morph_rules = null
+base_model = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.001
+
+[pretraining]
+max_epochs = 1000
+min_length = 5
+max_length = 500
+dropout = 0.2
+n_save_every = null
+batch_size = 3000
+seed = ${training:seed}
+use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
+tok2vec_model = "components.tok2vec.model"
+
+[pretraining.objective]
+type = "characters"
+n_characters = 4
+
+[pretraining.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
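As a quick usage note (not from the diff): this new default config can be loaded and inspected with Thinc's Config class, the same class train() uses above for user configs. A small sketch follows; the relative path assumes you are running from the repository root, and the interpolated value assumes references like ${training:seed} are resolved against the [training] block at load time.

# Config behaves like a nested dict once parsed.
from thinc.api import Config

config = Config().from_disk("spacy/default_config.cfg")
print(config["training"]["dropout"])    # 0.1
print(config["training"]["batch_by"])   # sequences
print(config["pretraining"]["seed"])    # 0, via ${training:seed}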
							
								
								
									
spacy/errors.py | 108

@@ -124,20 +124,24 @@ class Warnings:
 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
-    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
-            "calls `nlp.create_pipe` with a component name that's not built "
-            "in - for example, when constructing the pipeline from a model's "
-            "meta.json. If you're using a custom component, you can write to "
-            "`Language.factories['{name}']` or remove it from the model meta "
-            "and add it via `nlp.add_pipe` instead.")
+    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
+            "This usually happens when spaCy calls nlp.{method} with custom "
+            "component name that's not registered on the current language class. "
+            "If you're using a custom component, make sure you've added the "
+            "decorator @Language.component (for function components) or "
+            "@Language.factory (for class components).\n\nAvailable "
+            "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
-            "got {component} (name: '{name}').")
-    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
-            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
+            "got {component} (name: '{name}'). If you're using a custom "
+            "component factory, double-check that it correctly returns your "
+            "initialized component.")
+    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
     E005 = ("Pipeline component '{name}' returned None. If you're using a "
             "custom component, maybe you forgot to return the processed Doc?")
-    E006 = ("Invalid constraints. You can only set one of the following: "
-            "before, after, first, last.")
+    E006 = ("Invalid constraints for adding pipeline component. You can only "
+            "set one of the following: before (component name or index), "
+            "after (component name or index), first (True) or last (True). "
+            "Invalid configuration: {args}. Existing components: {opts}")
     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
     E008 = ("Some current components would be lost when restoring previous "
             "pipeline state. If you added components after calling "

@@ -184,7 +188,7 @@ class Errors:
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
+            "nlp.add_pipe('sentencizer'). "
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")

@@ -365,8 +369,6 @@ class Errors:
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
-            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
     E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
             "to provide a valid JSON object as input with either the `text` "
             "or `tokens` key. For more info, see the docs:\n"

@@ -484,6 +486,62 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E956 = ("Can't find component '{name}' in [components] block in the config. "
+            "Available components: {opts}")
+    E957 = ("Writing directly to Language.factories isn't needed anymore in "
+            "spaCy v3. Instead, you can use the @Language.factory decorator "
+            "to register your custom component factory or @Language.component "
+            "to register a simple stateless function component that just takes "
+            "a Doc and returns it.")
+    E958 = ("Language code defined in config ({bad_lang_code}) does not match "
+            "language code of current Language subclass {lang} ({lang_code})")
+    E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
+    E960 = ("No config data found for component '{name}'. This is likely a bug "
+            "in spaCy.")
+    E961 = ("Found non-serializable Python object in config. Configs should "
+            "only include values that can be serialized to JSON. If you need "
+            "to pass models or other objects to your component, use a reference "
+            "to a registered function or initialize the object in your "
+            "component.\n\n{config}")
+    E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
+            "got: {cfg_type}.")
+    E963 = ("Can't read component info from @Language.{decorator} decorator. "
+            "Maybe you forgot to call it? Make sure you're using "
+            "@Language.{decorator}() instead of @Language.{decorator}.")
+    E964 = ("The pipeline component factory for '{name}' needs to have the "
+            "following named arguments, which are passed in by spaCy:\n- nlp: "
+            "receives the current nlp object and lets you access the vocab\n- "
+            "name: the name of the component instance, can be used to identify "
+            "the component, output losses etc.")
+    E965 = ("It looks like you're using the @Language.component decorator to "
+            "register '{name}' on a class instead of a function component. If "
+            "you need to register a class or function that *returns* a component "
+            "function, use the @Language.factory decorator instead.")
+    E966 = ("nlp.add_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component} (name: '{name}').\n\n- If you created your component "
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
+            "like TextCategorizer(): call nlp.add_pipe with the string name "
+            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
+            "component: Add the decorator @Language.component (for function "
+            "components) or @Language.factory (for class components / factories) "
+            "to your custom component and assign it a name, e.g. "
+            "@Language.component('your_name'). You can then run "
+            "nlp.add_pipe('your_name') to add it to the pipeline.")
+    E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
+    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component}.\n\n- If you created your component with"
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
+            "component like TextCategorizer(): call nlp.replace_pipe with the "
+            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
+            "- If you're using a custom component: Add the decorator "
+            "@Language.component (for function components) or @Language.factory "
+            "(for class components / factories) to your custom component and "
+            "assign it a name, e.g. @Language.component('your_name'). You can "
+            "then run nlp.replace_pipe('{name}', 'your_name').")
     E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "

@@ -506,10 +564,12 @@ class Errors:
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
             "{keys}")
-    E985 = ("The pipeline component '{component}' is already available in the base "
-            "model. The settings in the component block in the config file are "
-            "being ignored. If you want to replace this component instead, set "
-            "'replace' to True in the training configuration.")
+    E984 = ("Invalid component config for '{name}': no @factories key "
+            "specifying the registered function used to initialize the "
+            "component. For example, @factories = \"ner\" will use the 'ner' "
+            "factory and all other settings in the block will be passed "
+            "to it as arguments.\n\n{config}")
+    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
     E986 = ("Could not create any training batches: check your input. "
             "Perhaps discard_oversize should be set to False ?")
     E987 = ("The text of an example training instance is either a Doc or "

@@ -530,9 +590,9 @@ class Errors:
     E992 = ("The function `select_pipes` was called with `enable`={enable} "
             "and `disable`={disable} but that information is conflicting "
             "for the `nlp` pipeline with components {names}.")
-    E993 = ("The config for 'nlp' should include either a key 'name' to "
-            "refer to an existing model by name or path, or a key 'lang' "
-            "to create a new blank model.")
+    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
+            "the code of the language to initialize it with (for example "
+            "'en' for English).\n\n{config}")
     E996 = ("Could not parse {file}: {msg}")
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "

@@ -540,9 +600,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-            "initializing the pipeline: "
-            '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
-            'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
+             "initializing the pipeline:\n"
+             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
+             'nlp = Chinese(config=cfg)')
 
 
 @add_codes
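The new E957, E965 and E966 messages above all describe the same registration pattern for custom components. A minimal sketch of that pattern, based purely on the wording of those error messages (the component name "my_component" is made up):

import spacy
from spacy.language import Language

@Language.component("my_component")  # register a stateless function component
def my_component(doc):
    # do something with the Doc and return it (otherwise E005 is raised)
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("my_component")  # add by string name, as E966 describes
print(nlp.pipe_names)         # ['my_component']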
@@ -1,10 +1,9 @@
 import re
 
 from .conll_ner2docs import n_sents_info
-from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
-from ...language import Language
 from ...tokens import Doc, Token, Span
+from ...vocab import Vocab
 from wasabi import Printer
 
 

@@ -73,7 +72,7 @@ def read_conllx(
     ner_map=None,
 ):
     """ Yield docs, one for each sentence """
-    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
+    vocab = Vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class AfrikaansDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "af"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "af"
+stop_words = {"@language_data": "spacy.af.stop_words"}
+"""
+
+
+@registry.language_data("spacy.af.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Afrikaans(Language):
     lang = "af"
-    Defaults = AfrikaansDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Afrikaans"]
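A short sketch (not part of the diff) of how a {"@language_data": "spacy.af.stop_words"} reference like the one above can be resolved by hand, assuming the new language_data table behaves like spaCy's other catalogue-based registries and that importing the language module is what registers the function:

from spacy.lang.af import Afrikaans  # noqa: F401 - importing registers "spacy.af.stop_words"
from spacy.util import registry

stop_words_func = registry.language_data.get("spacy.af.stop_words")
print(len(stop_words_func()), "Afrikaans stop words")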
@@ -1,31 +1,48 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ar"
+stop_words = {"@language_data": "spacy.ar.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+"""
+
+
+@registry.language_data("spacy.ar.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ar.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class ArabicDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ar"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Arabic(Language):
     lang = "ar"
     Defaults = ArabicDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Arabic"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class BulgarianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "bg"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "bg"
+stop_words = {"@language_data": "spacy.bg.stop_words"}
+"""
+
+
+@registry.language_data("spacy.bg.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Bulgarian(Language):
     lang = "bg"
-    Defaults = BulgarianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Bulgarian"]
@@ -1,18 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "bn"
+stop_words = {"@language_data": "spacy.bn.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.bn.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class BengaliDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "bn"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES

@@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
 class Bengali(Language):
     lang = "bn"
     Defaults = BengaliDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Bengali"]
| 
						 | 
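The [nlp.lemmatizer.data_paths] block reuses the language code through config interpolation, so the lookups package is queried for the right language without repeating the value. A hedged sketch of the ${nlp:lang} syntax used above (registry references omitted; assumes thinc's default interpolation on from_str):

    # Sketch only: ${section:key} copies a value from another section at parse time.
    from thinc.api import Config

    cfg_text = """
    [nlp]
    lang = "bn"

    [nlp.lemmatizer]

    [nlp.lemmatizer.data_paths]
    lang = ${nlp:lang}
    """

    config = Config().from_str(cfg_text)  # interpolates by default
    print(config["nlp"]["lemmatizer"]["data_paths"]["lang"])  # "bn"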
spacy/lang/ca/__init__.py
@@ -1,31 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry

 from .punctuation import TOKENIZER_INFIXES
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ca"
+stop_words = {"@language_data": "spacy.ca.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ca.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ca.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class CatalanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ca"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES


 class Catalan(Language):
     lang = "ca"
     Defaults = CatalanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Catalan"]
spacy/lang/cs/__init__.py
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "cs"
+stop_words = {"@language_data": "spacy.cs.stop_words"}
+"""


-class CzechDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "cs"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.cs.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Czech(Language):
     lang = "cs"
-    Defaults = CzechDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Czech"]
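The registered stop-word functions are plain entries in spaCy's function registry, so a config reference such as "spacy.cs.stop_words" can be resolved back into the function by name. A hedged sketch of the round trip (the registry name below is hypothetical, chosen only for illustration; assumes catalogue's standard get()):

    # Sketch only: register and look up a language-data function by name.
    from spacy.util import registry

    @registry.language_data("example.cs.stop_words")  # hypothetical entry name
    def example_stop_words():
        return {"a", "aby", "ahoj"}

    func = registry.language_data.get("example.cs.stop_words")
    print(func())  # {"a", "aby", "ahoj"}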
spacy/lang/da/__init__.py
@@ -1,27 +1,50 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "da"
+stop_words = {"@language_data": "spacy.da.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.da.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.da.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class DanishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "da"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS


 class Danish(Language):
     lang = "da"
     Defaults = DanishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Danish"]
spacy/lang/de/__init__.py
@@ -1,23 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "de"
+stop_words = {"@language_data": "spacy.de.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.de.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class GermanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "de"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     single_orth_variants = [
         {"tags": ["$("], "variants": ["…", "..."]},
@@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
 class German(Language):
     lang = "de"
     Defaults = GermanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["German"]
spacy/lang/el/__init__.py
@@ -1,3 +1,6 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "el"
+stop_words = {"@language_data": "spacy.el.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.GreekLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.lemmatizers("spacy.GreekLemmatizer.v1")
+def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
+    return GreekLemmatizer(data_paths=data_paths)
+
+
+@registry.language_data("spacy.el.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.el.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class GreekDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "el"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return GreekLemmatizer(lookups)
-

 class Greek(Language):
     lang = "el"
     Defaults = GreekDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Greek"]
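The @lemmatizers = "spacy.GreekLemmatizer.v1" line in the config is what ties the [nlp.lemmatizer] block to the factory registered above: when the nlp object is built from the config, the name is looked up in the registry and the factory is called with the resolved data_paths argument. A hedged sketch of that resolution (illustrative only; it assumes the refactored Lemmatizer signature from this PR, and whether any tables actually load depends on spacy-lookups-data being installed):

    # Sketch only: resolve the registered lemmatizer factory by name.
    import spacy.lang.el  # noqa: F401 - importing the module runs the @registry decorators
    from spacy.util import registry

    factory = registry.lemmatizers.get("spacy.GreekLemmatizer.v1")
    lemmatizer = factory(data_paths={})  # the config's data_paths block fills this keyword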
spacy/lang/el/lemmatizer.py
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 from ...lemmatizer import Lemmatizer


@@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
     not applicable for Greek language.
     """

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         string = string.lower()
         forms = []
         if string in index:
spacy/lang/en/__init__.py
@@ -1,25 +1,50 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .lemmatizer import is_base_form
 from .punctuation import TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...lemmatizer import Lemmatizer
+from ...util import update_exc, registry


-def _return_en(_):
-    return "en"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "en"
+stop_words = {"@language_data": "spacy.en.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.EnglishLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.en.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.en.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)


 class EnglishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = _return_en
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     infixes = TOKENIZER_INFIXES
     single_orth_variants = [
@@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
         {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
     ]

-    @classmethod
-    def is_base_form(cls, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-

 class English(Language):
     lang = "en"
     Defaults = EnglishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["English"]
spacy/lang/en/lemmatizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from typing import Optional
+
+
+def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
+    """
+    Check whether we're dealing with an uninflected paradigm, so we can
+    avoid lemmatization entirely.
+
+    univ_pos (unicode / int): The token's universal part-of-speech tag.
+    morphology (dict): The token's morphological features following the
+        Universal Dependencies scheme.
+    """
+    if morphology is None:
+        morphology = {}
+    if univ_pos == "noun" and morphology.get("Number") == "sing":
+        return True
+    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
+        return True
+    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+    # morphology
+    elif univ_pos == "verb" and (
+        morphology.get("VerbForm") == "fin"
+        and morphology.get("Tense") == "pres"
+        and morphology.get("Number") is None
+    ):
+        return True
+    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
+        return True
+    elif morphology.get("VerbForm") == "inf":
+        return True
+    elif morphology.get("VerbForm") == "none":
+        return True
+    elif morphology.get("Degree") == "pos":
+        return True
+    else:
+        return False
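Moving is_base_form out of EnglishDefaults makes it a plain module-level function that can be passed into the shared Lemmatizer as a callback. A quick illustrative check of its behaviour against this branch (values chosen to hit the branches shown above):

    # Sketch only: exercising the is_base_form branches above.
    from spacy.lang.en.lemmatizer import is_base_form

    print(is_base_form("noun", {"Number": "sing"}))   # True: singular noun
    print(is_base_form("verb", {"VerbForm": "inf"}))  # True: infinitive
    print(is_base_form("adj", {"Degree": "cmp"}))     # False: comparative adjective
    print(is_base_form("noun"))                       # False: no morphology given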
spacy/lang/en/lex_attrs.py
@@ -1,47 +1,17 @@
 from ...attrs import LIKE_NUM

+# fmt: off
 _num_words = [
-    "zero",
-    "one",
-    "two",
-    "three",
-    "four",
-    "five",
-    "six",
-    "seven",
-    "eight",
-    "nine",
-    "ten",
-    "eleven",
-    "twelve",
-    "thirteen",
-    "fourteen",
-    "fifteen",
-    "sixteen",
-    "seventeen",
-    "eighteen",
-    "nineteen",
-    "twenty",
-    "thirty",
-    "forty",
-    "fifty",
-    "sixty",
-    "seventy",
-    "eighty",
-    "ninety",
-    "hundred",
-    "thousand",
-    "million",
-    "billion",
-    "trillion",
-    "quadrillion",
-    "gajillion",
-    "bazillion",
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
+    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
+    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 ]
+# fmt: on


-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")
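like_num itself only gains annotations here; the word list is just reflowed. For context, a hedged sketch of how such a getter is exposed to the pipeline via the module's LEX_ATTRS mapping (the mapping shape follows the usual spaCy lex_attrs pattern and is assumed rather than shown in this hunk, and the expected outputs assume the unchanged tail of like_num):

    # Sketch only: LEX_ATTRS maps attribute IDs to getter functions such as like_num.
    from spacy.attrs import LIKE_NUM
    from spacy.lang.en.lex_attrs import like_num

    LEX_ATTRS_EXAMPLE = {LIKE_NUM: like_num}  # assumed shape of the module's LEX_ATTRS

    print(like_num("ten"))     # True: listed in _num_words
    print(like_num("10,000"))  # True: all digits once separators are stripped
    print(like_num("tenth"))   # False: not in the word list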
spacy/lang/es/__init__.py
@@ -1,33 +1,52 @@
+from typing import Set, Dict, Callable, Any
+from thinc.config import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "es"
+stop_words = {"@language_data": "spacy.es.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.es.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.es.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class SpanishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "es"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS


 class Spanish(Language):
     lang = "es"
     Defaults = SpanishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Spanish"]
spacy/lang/et/__init__.py
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "et"
+stop_words = {"@language_data": "spacy.et.stop_words"}
+"""


-class EstonianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "et"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.et.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Estonian(Language):
     lang = "et"
-    Defaults = EstonianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Estonian"]
spacy/lang/eu/__init__.py
@@ -1,25 +1,41 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "eu"
+stop_words = {"@language_data": "spacy.eu.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.eu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.eu.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class BasqueDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "eu"
-
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES


 class Basque(Language):
     lang = "eu"
     Defaults = BasqueDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Basque"]
spacy/lang/fa/__init__.py
@@ -1,7 +1,8 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
-from ..norm_exceptions import BASE_NORMS
+from ...util import update_exc, registry
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fa"
+stop_words = {"@language_data": "spacy.fa.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.fa.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fa.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class PersianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters[LANG] = lambda text: "fa"
     tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
     syntax_iterators = SYNTAX_ITERATORS


 class Persian(Language):
     lang = "fa"
     Defaults = PersianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Persian"]
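Persian is the first language in this batch whose defaults carry a writing_system, so the dict that used to live on PersianDefaults now becomes an [nlp.writing_system] config block. Once parsed, the lowercase config booleans come back as Python booleans. A minimal sketch, not the actual loading code (assumes thinc's JSON-style value parsing):

    # Sketch only: reading the writing-system block from a parsed config.
    from thinc.api import Config

    cfg = Config().from_str("""
    [nlp]
    lang = "fa"

    [nlp.writing_system]
    direction = "rtl"
    has_case = false
    has_letters = true
    """)
    ws = cfg["nlp"]["writing_system"]
    print(ws["direction"], ws["has_case"], ws["has_letters"])  # rtl False True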
spacy/lang/fi/__init__.py
@@ -1,31 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fi"
+stop_words = {"@language_data": "spacy.fi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.fi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class FinnishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fi"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Finnish(Language):
     lang = "fi"
     Defaults = FinnishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Finnish"]
spacy/lang/fr/__init__.py
@@ -1,44 +1,61 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import FrenchLemmatizer
+from .lemmatizer import FrenchLemmatizer, is_base_form
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "fr"
+stop_words = {"@language_data": "spacy.fr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.FrenchLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
+def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+
+
+@registry.language_data("spacy.fr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class FrenchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
     syntax_iterators = SYNTAX_ITERATORS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return FrenchLemmatizer(lookups)
-

 class French(Language):
     lang = "fr"
     Defaults = FrenchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["French"]
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict
+
 from ...lemmatizer import Lemmatizer
 from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
 from ...symbols import SCONJ, CCONJ
@@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
     the lookup table.
     """
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         if "lemma_rules" not in self.lookups:
             return [lookup_table.get(string, string)]
@@ -52,7 +56,47 @@ class FrenchLemmatizer(Lemmatizer):
         )
         return lemmas
 
-    def is_base_form(self, univ_pos, morphology=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        if orth is not None and orth in lookup_table:
+            return lookup_table[orth][0]
+        return string
+
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(lookup_table[string][0])
+        if not forms:
+            forms.append(string)
+        return list(set(forms))
+
+
+def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
     """
     Check whether we're dealing with an uninflected paradigm, so we can
     avoid lemmatization entirely.
@@ -88,48 +132,3 @@ class FrenchLemmatizer(Lemmatizer):
         return True
     else:
         return False
-
-    def noun(self, string, morphology=None):
-        return self(string, "noun", morphology)
-
-    def verb(self, string, morphology=None):
-        return self(string, "verb", morphology)
-
-    def adj(self, string, morphology=None):
-        return self(string, "adj", morphology)
-
-    def punct(self, string, morphology=None):
-        return self(string, "punct", morphology)
-
-    def lookup(self, string, orth=None):
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        if orth is not None and orth in lookup_table:
-            return lookup_table[orth][0]
-        return string
-
-    def lemmatize(self, string, index, exceptions, rules):
-        lookup_table = self.lookups.get_table("lemma_lookup", {})
-        string = string.lower()
-        forms = []
-        if string in index:
-            forms.append(string)
-            return forms
-        forms.extend(exceptions.get(string, []))
-        oov_forms = []
-        if not forms:
-            for old, new in rules:
-                if string.endswith(old):
-                    form = string[: len(string) - len(old)] + new
-                    if not form:
-                        pass
-                    elif form in index or not form.isalpha():
-                        forms.append(form)
-                    else:
-                        oov_forms.append(form)
-        if not forms:
-            forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(lookup_table[string][0])
-        if not forms:
-            forms.append(string)
-        return list(set(forms))
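
Illustrative sketch (not part of the diff): a minimal exercise of the lookup fallback in FrenchLemmatizer.__call__ shown above. The table contents are made up for the example; real data would come from spacy-lookups-data.

# The "avions" -> "avion" entry below is invented example data.
from spacy.lookups import Lookups
from spacy.lang.fr.lemmatizer import FrenchLemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"avions": "avion"})
lemmatizer = FrenchLemmatizer(lookups)
# No "lemma_rules" table is registered, so __call__ falls back to the lookup table.
assert lemmatizer("avions", "noun") == ["avion"]
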
@@ -1,23 +1,33 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ga"
+stop_words = {"@language_data": "spacy.ga.stop_words"}
+"""
+
+
+@registry.language_data("spacy.ga.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class IrishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ga"
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
 
 
 class Irish(Language):
     lang = "ga"
     Defaults = IrishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Irish"]
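
Illustrative sketch (not part of the diff): how a {"@language_data": ...} reference in DEFAULT_CONFIG is expected to resolve for the Irish (ga) module above. It assumes the registry behaves like a standard catalogue registry with a .get() lookup.

# Assumption: registry.language_data.get() returns the function registered above.
import spacy.lang.ga  # noqa: F401 - importing the package runs the registration
from spacy.lang.ga.stop_words import STOP_WORDS
from spacy.util import registry

stop_words_func = registry.language_data.get("spacy.ga.stop_words")
assert stop_words_func() is STOP_WORDS
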
@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
-
 from ...language import Language
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "gu"
+stop_words = {"@language_data": "spacy.gu.stop_words"}
+"""
 
 
-class GujaratiDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.gu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Gujarati(Language):
     lang = "gu"
-    Defaults = GujaratiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Gujarati"]
@@ -1,22 +1,37 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "he"
+stop_words = {"@language_data": "spacy.he.stop_words"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+"""
+
+
+@registry.language_data("spacy.he.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class HebrewDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "he"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Hebrew(Language):
     lang = "he"
     Defaults = HebrewDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hebrew"]
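
Illustrative sketch (not part of the diff): what the Hebrew DEFAULT_CONFIG above parses to, replacing the old writing_system dict on the Defaults class. It assumes thinc's Config exposes sections as nested dicts.

# Assumption: nested [nlp.writing_system] is reachable as config["nlp"]["writing_system"].
from thinc.api import Config
from spacy.lang.he import DEFAULT_CONFIG

config = Config().from_str(DEFAULT_CONFIG)
assert config["nlp"]["lang"] == "he"
assert config["nlp"]["writing_system"] == {
    "direction": "rtl",
    "has_case": False,
    "has_letters": True,
}
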
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hi"
+stop_words = {"@language_data": "spacy.hi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
+"""
 
 
-class HindiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "hi"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.hi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Hindi(Language):
     lang = "hi"
-    Defaults = HindiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hindi"]
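
Illustrative sketch (not part of the diff): the registered lex_attr_getters entry for Hindi simply hands back the LEX_ATTRS mapping of attribute IDs to callables, so a consumer can look it up by name instead of importing the module. It assumes a catalogue-style registry with .get().

# Assumption: the registered function returns the module-level LEX_ATTRS dict unchanged.
import spacy.lang.hi  # noqa: F401 - importing the package runs the registrations
from spacy.util import registry

getters = registry.language_data.get("spacy.hi.lex_attr_getters")()
assert getters and all(callable(func) for func in getters.values())
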
@@ -1,25 +1,39 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hr"
+stop_words = {"@language_data": "spacy.hr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class CroatianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Croatian(Language):
     lang = "hr"
     Defaults = CroatianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Croatian"]
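
Illustrative sketch (not part of the diff): ${nlp:lang} in the [nlp.lemmatizer.data_paths] block above is a config variable rather than a literal. Assuming thinc's Config interpolates variables when parsing, the data_paths section ends up carrying the same language code as the [nlp] block.

# Assumption: from_str() interpolates ${nlp:lang} to the value of [nlp] lang.
from thinc.api import Config
from spacy.lang.hr import DEFAULT_CONFIG

config = Config().from_str(DEFAULT_CONFIG)
assert config["nlp"]["lemmatizer"]["data_paths"]["lang"] == config["nlp"]["lang"] == "hr"
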
@@ -1,22 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hu"
+stop_words = {"@language_data": "spacy.hu.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class HungarianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hu"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
 class Hungarian(Language):
     lang = "hu"
     Defaults = HungarianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Hungarian"]
@@ -1,21 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
-from ...attrs import LANG
 from ...language import Language
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hy"
+stop_words = {"@language_data": "spacy.hy.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
+"""
 
 
-class ArmenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hy"
-
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.hy.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hy.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Armenian(Language):
     lang = "hy"
-    Defaults = ArmenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Armenian"]
@@ -1,21 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.config import Config
+
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "id"
+stop_words = {"@language_data": "spacy.id.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.id.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.id.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class IndonesianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "id"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
 class Indonesian(Language):
     lang = "id"
     Defaults = IndonesianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Indonesian"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "is"
+stop_words = {"@language_data": "spacy.is.stop_words"}
+"""
 
 
-class IcelandicDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "is"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.is.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Icelandic(Language):
     lang = "is"
-    Defaults = IcelandicDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Icelandic"]
@@ -1,20 +1,34 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "it"
+stop_words = {"@language_data": "spacy.it.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.it.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class ItalianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "it"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
 class Italian(Language):
     lang = "it"
     Defaults = ItalianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Italian"]
@@ -1,21 +1,187 @@
+from typing import Optional, Union, Dict, Any, Set
+from pathlib import Path
 import srsly
-from collections import namedtuple, OrderedDict
+from collections import namedtuple
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...attrs import LANG
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...symbols import POS
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 from ... import util
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ja"
+stop_words = {"@language_data": "spacy.ja.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.JapaneseTokenizer.v1"
+split_mode = null
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ja.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.JapaneseTokenizer.v1")
+def create_japanese_tokenizer(split_mode: Optional[str] = None):
+    def japanese_tokenizer_factory(nlp):
+        return JapaneseTokenizer(nlp, split_mode=split_mode)
+
+    return japanese_tokenizer_factory
+
+
+class JapaneseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
+        self.vocab = nlp.vocab
+        self.split_mode = split_mode
+        self.tokenizer = try_sudachi_import(self.split_mode)
+
+    def __call__(self, text: str) -> Doc:
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
+
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = (
+            zip(*dtokens) if dtokens else [[]] * 6
+        )
+        sub_tokens_list = list(sub_tokens_list)
+        doc = Doc(self.vocab, words=words, spaces=spaces)
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
+                token.pos = next_pos
+                next_pos = None
+            else:
+                token.pos, next_pos = resolve_pos(
+                    token.orth_,
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None,
+                )
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list
+        return doc
+
+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
+        sub_tokens_list = (
+            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        )
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
+                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx]
+                if sub_tokens_list
+                else None,  # user_data['sub_tokens']
+            )
+            for idx, token in enumerate(sudachipy_tokens)
+            if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t
+            for idx, t in enumerate(dtokens)
+            if idx == 0
+            or not t.surface.isspace()
+            or t.tag != "空白"
+            or not dtokens[idx - 1].surface.isspace()
+            or dtokens[idx - 1].tag != "空白"
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if (
+            self.split_mode is None or self.split_mode == "A"
+        ):  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append(
+                        [
+                            self._get_dtokens(sub_a, False),
+                            self._get_dtokens(sub_b, False),
+                        ]
+                    )
+        return sub_tokens_list
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"split_mode": self.split_mode}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.split_mode = config.get("split_mode", None)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+
+class JapaneseDefaults(Language.Defaults):
+    tag_map = TAG_MAP
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Japanese(Language):
+    lang = "ja"
+    Defaults = JapaneseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 # Hold the attributes we need with convenient names
 DetailedToken = namedtuple(
     "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces
 
 
-class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None, config={}):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.split_mode = config.get("split_mode", None)
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-    def __call__(self, text):
-        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
-        sudachipy_tokens = self.tokenizer.tokenize(text)
-        dtokens = self._get_dtokens(sudachipy_tokens)
-        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
-
-        # create Doc with tag bi-gram based part-of-speech identification rules
-        words, tags, inflections, lemmas, readings, sub_tokens_list = (
-            zip(*dtokens) if dtokens else [[]] * 6
-        )
-        sub_tokens_list = list(sub_tokens_list)
-        doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None  # for bi-gram rules
-        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
-            token.tag_ = dtoken.tag
-            if next_pos:  # already identified in previous iteration
-                token.pos = next_pos
-                next_pos = None
-            else:
-                token.pos, next_pos = resolve_pos(
-                    token.orth_,
-                    dtoken.tag,
-                    tags[idx + 1] if idx + 1 < len(tags) else None,
-                )
-            # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
-
-        doc.user_data["inflections"] = inflections
-        doc.user_data["reading_forms"] = readings
-        doc.user_data["sub_tokens"] = sub_tokens_list
-
-        return doc
-
-    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
-        sub_tokens_list = (
-            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
-        )
-        dtokens = [
-            DetailedToken(
-                token.surface(),  # orth
-                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
-                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
-                token.dictionary_form(),  # lemma
-                token.reading_form(),  # user_data['reading_forms']
-                sub_tokens_list[idx]
-                if sub_tokens_list
-                else None,  # user_data['sub_tokens']
-            )
-            for idx, token in enumerate(sudachipy_tokens)
-            if len(token.surface()) > 0
-            # remove empty tokens which can be produced with characters like … that
-        ]
-        # Sudachi normalizes internally and outputs each space char as a token.
-        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
-        return [
-            t
-            for idx, t in enumerate(dtokens)
-            if idx == 0
-            or not t.surface.isspace()
-            or t.tag != "空白"
-            or not dtokens[idx - 1].surface.isspace()
-            or dtokens[idx - 1].tag != "空白"
-        ]
-
-    def _get_sub_tokens(self, sudachipy_tokens):
-        if (
-            self.split_mode is None or self.split_mode == "A"
-        ):  # do nothing for default split mode
-            return None
-
-        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
-        for token in sudachipy_tokens:
-            sub_a = token.split(self.tokenizer.SplitMode.A)
-            if len(sub_a) == 1:  # no sub tokens
-                sub_tokens_list.append(None)
-            elif self.split_mode == "B":
-                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
-            else:  # "C"
-                sub_b = token.split(self.tokenizer.SplitMode.B)
-                if len(sub_a) == len(sub_b):
-                    dtokens = self._get_dtokens(sub_a, False)
-                    sub_tokens_list.append([dtokens, dtokens])
-                else:
-                    sub_tokens_list.append(
-                        [
-                            self._get_dtokens(sub_a, False),
-                            self._get_dtokens(sub_b, False),
-                        ]
-                    )
-        return sub_tokens_list
-
-    def _get_config(self):
-        config = OrderedDict((("split_mode", self.split_mode),))
-        return config
-
-    def _set_config(self, config={}):
-        self.split_mode = config.get("split_mode", None)
-
-    def to_bytes(self, **kwargs):
-        serializers = OrderedDict(
-            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
-        )
-        return util.to_bytes(serializers, [])
-
-    def from_bytes(self, data, **kwargs):
-        deserializers = OrderedDict(
-            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
-        )
-        util.from_bytes(data, deserializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-        return self
-
-    def to_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
-        )
-        return util.to_disk(path, serializers, [])
-
-    def from_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
-        )
-        util.from_disk(path, serializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-
-class JapaneseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ja"
-    stop_words = STOP_WORDS
-    tag_map = TAG_MAP
-    syntax_iterators = SYNTAX_ITERATORS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return JapaneseTokenizer(cls, nlp, config)
-
-
-class Japanese(Language):
-    lang = "ja"
-    Defaults = JapaneseDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
-
-
 def pickle_japanese(instance):
     return Japanese, tuple()
 
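
Illustrative sketch (not part of the diff): the registered Japanese tokenizer entry point is a two-step factory, i.e. spacy.JapaneseTokenizer.v1 is called with the config arguments (split_mode) and returns a callable that takes nlp and builds the tokenizer. The manual wiring below mimics what resolving [nlp.tokenizer] is meant to do; it assumes SudachiPy is installed so try_sudachi_import() succeeds, and that Japanese() is constructible from its defaults at this point in the refactor.

# Assumption: SudachiPy (sudachipy + sudachidict_core) is available.
from spacy.lang.ja import Japanese, create_japanese_tokenizer

nlp = Japanese()
tokenizer = create_japanese_tokenizer(split_mode="B")(nlp)
doc = tokenizer("日本語のテキストを解析します")
print([(token.text, token.tag_, token.lemma_) for token in doc])
print(doc.user_data["sub_tokens"])  # populated for split modes "B" and "C"
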
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class KannadaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "kn"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "kn"
+stop_words = {"@language_data": "spacy.kn.stop_words"}
+"""
+
+
+@registry.language_data("spacy.kn.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Kannada(Language):
     lang = "kn"
-    Defaults = KannadaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Kannada"]
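The Kannada module above shows the new shape shared by most of the small languages in this diff: language data moves out of a Defaults subclass into a DEFAULT_CONFIG string plus functions registered under registry.language_data. A rough usage sketch, assuming the module layout shown here and that the new language_data table behaves like spaCy's other catalogue-based registries:

    from thinc.api import Config
    from spacy.lang.kn import DEFAULT_CONFIG  # importing the module registers spacy.kn.stop_words
    from spacy.util import registry

    config = Config().from_str(DEFAULT_CONFIG)
    print(config["nlp"]["lang"])        # "kn"
    print(config["nlp"]["stop_words"])  # {"@language_data": "spacy.kn.stop_words"}

    # spaCy resolves the "@language_data" reference through the registry:
    stop_words = registry.language_data.get("spacy.kn.stop_words")()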
@@ -1,51 +1,52 @@
+from typing import Set, Optional, Any, Dict
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 
 
-def try_mecab_import():
-    try:
-        from natto import MeCab
-
-        return MeCab
-    except ImportError:
-        raise ImportError(
-            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
-        )
-
-
-# fmt: on
-
-
-def check_spaces(text, tokens):
-    prev_end = -1
-    start = 0
-    for token in tokens:
-        idx = text.find(token, start)
-        if prev_end > 0:
-            yield prev_end != idx
-        prev_end = idx + len(token)
-        start = prev_end
-    if start > 0:
-        yield False
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ko"
+stop_words = {"@language_data": "spacy.ko.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.KoreanTokenizer.v1"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ko.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.KoreanTokenizer.v1")
+def create_korean_tokenizer():
+    def korean_tokenizer_factory(nlp):
+        return KoreanTokenizer(nlp)
+
+    return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+    def __init__(self, nlp: Optional[Language] = None):
+        self.vocab = nlp.vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
 
     def __del__(self):
         self.mecab_tokenizer.__del__()
 
-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         dtokens = list(self.detailed_tokens(text))
         surfaces = [dt["surface"] for dt in dtokens]
         doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc
 
-    def detailed_tokens(self, text):
+    def detailed_tokens(self, text: str) -> Dict[str, Any]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):
 
 
 class KoreanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ko"
-    stop_words = STOP_WORDS
     tag_map = TAG_MAP
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return KoreanTokenizer(cls, nlp)
 
 
 class Korean(Language):
     lang = "ko"
     Defaults = KoreanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_mecab_import() -> None:
+    try:
+        from natto import MeCab
+
+        return MeCab
+    except ImportError:
+        raise ImportError(
+            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+        )
+
+
+def check_spaces(text, tokens):
+    prev_end = -1
+    start = 0
+    for token in tokens:
+        idx = text.find(token, start)
+        if prev_end > 0:
+            yield prev_end != idx
+        prev_end = idx + len(token)
+        start = prev_end
+    if start > 0:
+        yield False
 
 
 def pickle_korean(instance):
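Two details of the Korean refactor are worth spelling out: the tokenizer is now built through a registered two-step factory (create_korean_tokenizer returns korean_tokenizer_factory, which spaCy later calls with the nlp object so the vocab is available), and check_spaces re-aligns the MeCab surface forms with the original text to recover the spaces flags for the Doc. A small worked example of check_spaces with made-up input:

    text = "안녕 하세요."
    tokens = ["안녕", "하세요", "."]
    # one boolean per token: was this token followed by whitespace in `text`?
    spaces = list(check_spaces(text, tokens))
    assert spaces == [True, False, False]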
@@ -1,26 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lb"
+stop_words = {"@language_data": "spacy.lb.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.lb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.lb.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class LuxembourgishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "lb"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
 
 
 class Luxembourgish(Language):
     lang = "lb"
     Defaults = LuxembourgishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Luxembourgish"]
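The [nlp.lemmatizer] and [nlp.lemmatizer.data_paths] blocks added for Luxembourgish are the first of several identical ones in this diff: the lemmatizer is looked up by name, and its lookup tables come from the spacy-lookups-data package, with ${nlp:lang} interpolated from the [nlp] section so the language code is written only once. A hedged sketch of what parsing the string should give, assuming thinc's Config resolves the interpolation:

    from thinc.api import Config
    from spacy.lang.lb import DEFAULT_CONFIG  # module layout as in this diff

    config = Config().from_str(DEFAULT_CONFIG)
    # ${nlp:lang} in the data_paths block is filled in from [nlp]
    print(config["nlp"]["lemmatizer"]["data_paths"]["lang"])  # expected: "lb"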
@@ -1,3 +1,4 @@
+from typing import Set
 import unicodedata
 import re
 
@@ -21,21 +22,21 @@ _tlds = set(
 )
 
 
-def is_punct(text):
+def is_punct(text: str) -> bool:
     for char in text:
         if not unicodedata.category(char).startswith("P"):
             return False
     return True
 
 
-def is_ascii(text):
+def is_ascii(text: str) -> bool:
     for char in text:
         if ord(char) >= 128:
             return False
     return True
 
 
-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     # can be overwritten by lang with list of number words
@@ -49,64 +50,31 @@ def like_num(text):
     return False
 
 
-def is_bracket(text):
+def is_bracket(text: str) -> bool:
     brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
     return text in brackets
 
 
-def is_quote(text):
-    quotes = (
-        '"',
-        "'",
-        "`",
-        "«",
-        "»",
-        "‘",
-        "’",
-        "‚",
-        "‛",
-        "“",
-        "”",
-        "„",
-        "‟",
-        "‹",
-        "›",
-        "❮",
-        "❯",
-        "''",
-        "``",
-    )
+def is_quote(text: str) -> bool:
+    # fmt: off
+    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
+    # fmt: on
     return text in quotes
 
 
-def is_left_punct(text):
-    left_punct = (
-        "(",
-        "[",
-        "{",
-        "<",
-        '"',
-        "'",
-        "«",
-        "‘",
-        "‚",
-        "‛",
-        "“",
-        "„",
-        "‟",
-        "‹",
-        "❮",
-        "``",
-    )
+def is_left_punct(text: str) -> bool:
+    # fmt: off
+    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
+    # fmt: on
     return text in left_punct
 
 
-def is_right_punct(text):
+def is_right_punct(text: str) -> bool:
     right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
     return text in right_punct
 
 
-def is_currency(text):
+def is_currency(text: str) -> bool:
     # can be overwritten by lang with list of currency words, e.g. dollar, euro
     for char in text:
         if unicodedata.category(char) != "Sc":
@@ -114,11 +82,11 @@ def is_currency(text):
     return True
 
 
-def like_email(text):
+def like_email(text: str) -> bool:
     return bool(_like_email(text))
 
 
-def like_url(text):
+def like_url(text: str) -> bool:
     # We're looking for things that function in text like URLs. So, valid URL
     # or not, anything they say http:// is going to be good.
     if text.startswith("http://") or text.startswith("https://"):
@@ -144,7 +112,7 @@ def like_url(text):
     return False
 
 
-def word_shape(text):
+def word_shape(text: str) -> str:
     if len(text) >= 100:
         return "LONG"
     shape = []
@@ -171,46 +139,52 @@ def word_shape(text):
     return "".join(shape)
 
 
-def lower(string):
+def lower(string: str) -> str:
     return string.lower()
 
 
-def prefix(string):
+def prefix(string: str) -> str:
     return string[0]
 
 
-def suffix(string):
+def suffix(string: str) -> str:
     return string[-3:]
 
 
-def is_alpha(string):
+def is_alpha(string: str) -> bool:
     return string.isalpha()
 
 
-def is_digit(string):
+def is_digit(string: str) -> bool:
     return string.isdigit()
 
 
-def is_lower(string):
+def is_lower(string: str) -> bool:
     return string.islower()
 
 
-def is_space(string):
+def is_space(string: str) -> bool:
     return string.isspace()
 
 
-def is_title(string):
+def is_title(string: str) -> bool:
     return string.istitle()
 
 
-def is_upper(string):
+def is_upper(string: str) -> bool:
     return string.isupper()
 
 
-def is_stop(string, stops=set()):
+def is_stop(string: str, stops: Set[str] = set()) -> bool:
     return string.lower() in stops
 
 
+def get_lang(text: str, lang: str = "") -> str:
+    # This function is partially applied so lang code can be passed in
+    # automatically while still allowing pickling
+    return lang
+
+
 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,
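The comment on the new get_lang helper explains the design choice: the LANG getter used to be a per-language lambda, but a module-level function can be partially applied with the language code and still be pickled. A quick illustration (the "xx" code is an arbitrary stand-in):

    import functools
    import pickle

    get_lang_xx = functools.partial(get_lang, lang="xx")
    assert get_lang_xx("any text") == "xx"
    # a partial of a module-level function round-trips through pickle; a lambda would not
    assert pickle.loads(pickle.dumps(get_lang_xx))("any text") == "xx"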
@@ -1,28 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lij"
+stop_words = {"@language_data": "spacy.lij.stop_words"}
+"""
+
+
+@registry.language_data("spacy.lij.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class LigurianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lij"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
 
 
 class Ligurian(Language):
     lang = "lij"
     Defaults = LigurianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Ligurian"]
@@ -1,27 +1,41 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
-def _return_lt(_):
-    return "lt"
-
-
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lt"
+stop_words = {"@language_data": "spacy.lt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.lt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.lt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class LithuanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_lt
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     mod_base_exceptions = {
@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
     }
     del mod_base_exceptions["8)"]
     tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Lithuanian(Language):
     lang = "lt"
     Defaults = LithuanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Lithuanian"]
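LithuanianDefaults keeps its trimmed copy of the base tokenizer exceptions: entries ending in "." are filtered out and the "8)" emoticon is deleted explicitly. The same two steps in isolation, with toy data standing in for BASE_EXCEPTIONS:

    BASE = {"a.m.": [{"ORTH": "a.m."}], "8)": [{"ORTH": "8)"}], ":)": [{"ORTH": ":)"}]}
    mod_base_exceptions = {exc: val for exc, val in BASE.items() if not exc.endswith(".")}
    del mod_base_exceptions["8)"]
    assert list(mod_base_exceptions) == [":)"]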
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class LatvianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lv"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lv"
+stop_words = {"@language_data": "spacy.lv.stop_words"}
+"""
+
+
+@registry.language_data("spacy.lv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Latvian(Language):
     lang = "lv"
-    Defaults = LatvianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Latvian"]
@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
-
 from ...language import Language
+from ...util import registry
 
 
-class MalayalamDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ml"
+stop_words = {"@language_data": "spacy.ml.stop_words"}
+"""
+
+
+@registry.language_data("spacy.ml.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Malayalam(Language):
     lang = "ml"
-    Defaults = MalayalamDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Malayalam"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class MarathiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "mr"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "af"
+stop_words = {"@language_data": "spacy.mr.stop_words"}
+"""
+
+
+@registry.language_data("spacy.mr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Marathi(Language):
     lang = "mr"
-    Defaults = MarathiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Marathi"]
@@ -1,33 +1,47 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "nb"
+stop_words = {"@language_data": "spacy.nb.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.nb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class NorwegianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "nb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
 
 
 class Norwegian(Language):
     lang = "nb"
     Defaults = NorwegianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Norwegian"]
@@ -1,23 +1,33 @@
-# coding: utf8
-from __future__ import unicode_literals
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class NepaliDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ne"
+stop_words = {"@language_data": "spacy.ne.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.ne.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ne.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Nepali(Language):
     lang = "ne"
-    Defaults = NepaliDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Nepali"]
@@ -1,3 +1,6 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "nl"
+stop_words = {"@language_data": "spacy.nl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.DutchLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.nl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.nl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.DutchLemmatizer.v1")
+def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data_paths=data_paths)
+
+
 class DutchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "nl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return DutchLemmatizer(lookups)
-
 
 class Dutch(Language):
     lang = "nl"
     Defaults = DutchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Dutch"]
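Dutch is one of the languages whose lemmatizer subclass now reaches the pipeline through the registry rather than a create_lemmatizer classmethod: the config names "spacy.DutchLemmatizer.v1", and resolving it calls create_dutch_lemmatizer with the data_paths block filled in from spacy-lookups-data. A sketch of the lookup step only (actually constructing the lemmatizer needs the resolved paths, which are omitted here):

    from spacy.util import registry
    import spacy.lang.nl  # importing the module registers the factory

    factory = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")
    # spaCy resolves [nlp.lemmatizer.data_paths] first, then calls roughly:
    #     lemmatizer = factory(data_paths=resolved_paths)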
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict, Tuple
+
 from ...lemmatizer import Lemmatizer
 from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
 
@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
         "num": "num",
     }
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
         # String lowercased from the get-go. All lemmatization results in
@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
     # Overrides parent method so that a lowercased version of the string is
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         if orth is not None:
@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):
 
     # Reimplemented to focus more on application of suffix rules and to return
     # as early as possible.
-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> Tuple[List[str], bool]:
         # returns (forms, is_known: bool)
         oov_forms = []
         for old, new in rules:
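The annotations added to DutchLemmatizer make the contracts explicit: __call__ returns a list of candidate lemmas, lookup() takes the surface string plus an optional orth ID and always returns a single string, searching the table with the lowercased form, and lemmatize() returns the forms together with an is_known flag. A hypothetical call against an already-initialized DutchLemmatizer instance (construction omitted):

    # "Katten" is looked up as "katten", because the table only has lowercase keys
    lemma = nl_lemmatizer.lookup("Katten")
    assert isinstance(lemma, str)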
@@ -1,43 +1,60 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import add_lookups
-from ...lookups import Lookups
+from ...util import registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "pl"
+stop_words = {"@language_data": "spacy.pl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.PolishLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.pl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.pl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.PolishLemmatizer.v1")
+def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data_paths=data_paths)
+
+
 class PolishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "pl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     mod_base_exceptions = {
         exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
     }
     tokenizer_exceptions = mod_base_exceptions
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return PolishLemmatizer(lookups)
-
 
 class Polish(Language):
     lang = "pl"
     Defaults = PolishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Polish"]
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict
+
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
     # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
     # It utilizes some prefix based improvements for verb and adjectives
     # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         if isinstance(univ_pos, int):
             univ_pos = NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.upper()
-
         lookup_pos = univ_pos.lower()
         if univ_pos == "PROPN":
             lookup_pos = "noun"
         lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
-
         if univ_pos == "NOUN":
             return self.lemmatize_noun(string, morphology, lookup_table)
-
         if univ_pos != "PROPN":
             string = string.lower()
-
         if univ_pos == "ADJ":
             return self.lemmatize_adj(string, morphology, lookup_table)
         elif univ_pos == "VERB":
             return self.lemmatize_verb(string, morphology, lookup_table)
-
         return [lookup_table.get(string, string.lower())]
 
-    def lemmatize_adj(self, string, morphology, lookup_table):
+    def lemmatize_adj(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes different procedures for adjectives
         # with 'nie' and 'naj' prefixes
         if string[:3] == "nie":
@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                     return [lookup_table[naj_search_string]]
             if search_string in lookup_table:
                 return [lookup_table[search_string]]
-
         if string[:3] == "naj":
             naj_search_string = string[3:]
             if naj_search_string in lookup_table:
                 return [lookup_table[naj_search_string]]
-
         return [lookup_table.get(string, string)]
 
-    def lemmatize_verb(self, string, morphology, lookup_table):
+    def lemmatize_verb(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes a different procedure for verbs
         # with 'nie' prefix
         if string[:3] == "nie":
             search_string = string[3:]
 | 
				
			||||||
            if search_string in lookup_table:
 | 
					            if search_string in lookup_table:
 | 
				
			||||||
                return [lookup_table[search_string]]
 | 
					                return [lookup_table[search_string]]
 | 
				
			||||||
 | 
					 | 
				
			||||||
        return [lookup_table.get(string, string)]
 | 
					        return [lookup_table.get(string, string)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lemmatize_noun(self, string, morphology, lookup_table):
 | 
					    def lemmatize_noun(
 | 
				
			||||||
 | 
					        self, string: str, morphology: dict, lookup_table: Dict[str, str]
 | 
				
			||||||
 | 
					    ) -> List[str]:
 | 
				
			||||||
        # this method is case-sensitive, in order to work
 | 
					        # this method is case-sensitive, in order to work
 | 
				
			||||||
        # for incorrectly tagged proper names
 | 
					        # for incorrectly tagged proper names
 | 
				
			||||||
        if string != string.lower():
 | 
					        if string != string.lower():
 | 
				
			||||||
| 
						 | 
					@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
 | 
				
			||||||
            elif string in lookup_table:
 | 
					            elif string in lookup_table:
 | 
				
			||||||
                return [lookup_table[string]]
 | 
					                return [lookup_table[string]]
 | 
				
			||||||
            return [string.lower()]
 | 
					            return [string.lower()]
 | 
				
			||||||
 | 
					 | 
				
			||||||
        return [lookup_table.get(string, string)]
 | 
					        return [lookup_table.get(string, string)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lookup(self, string, orth=None):
 | 
					    def lookup(self, string: str, orth: Optional[int] = None) -> str:
 | 
				
			||||||
        return string.lower()
 | 
					        return string.lower()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lemmatize(self, string, index, exceptions, rules):
 | 
					    def lemmatize(
 | 
				
			||||||
 | 
					        self,
 | 
				
			||||||
 | 
					        string: str,
 | 
				
			||||||
 | 
					        index: Dict[str, List[str]],
 | 
				
			||||||
 | 
					        exceptions: Dict[str, Dict[str, List[str]]],
 | 
				
			||||||
 | 
					        rules: Dict[str, List[List[str]]],
 | 
				
			||||||
 | 
					    ) -> List[str]:
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
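Rough usage sketch for the typed lemmatizer API above (not part of the patch; the lookup table contents are made up, and this assumes the base Lemmatizer constructor still accepts a Lookups object, as the removed create_lemmatizer helper did):

from spacy.lang.pl.lemmatizer import PolishLemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
# Hypothetical entry: the real tables come from spacy-lookups-data
lookups.add_table("lemma_lookup_adj", {"milszy": "miły"})
lemmatizer = PolishLemmatizer(lookups)
print(lemmatizer("najmilszy", "ADJ"))  # -> ["miły"], via the "naj" prefix handling
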
@@ -1,20 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "pt"
+stop_words = {"@language_data": "spacy.pt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.pt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.pt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class PortugueseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
 
@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
 class Portuguese(Language):
     lang = "pt"
     Defaults = PortugueseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Portuguese"]

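Sketch of how the {"@language_data": "spacy.pt.stop_words"} references in the config resolve (not part of the patch; assumes the module has been imported so the decorators above have run):

from spacy.util import registry
import spacy.lang.pt  # noqa: F401  - importing the module executes the @registry.language_data decorators

stop_words_func = registry.language_data.get("spacy.pt.stop_words")
STOP_WORDS = stop_words_func()  # the set returned by the registered function
print(type(STOP_WORDS))         # <class 'set'>
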
@@ -1,27 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ro"
+stop_words = {"@language_data": "spacy.ro.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ro.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class RomanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ro"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
 
@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
 class Romanian(Language):
     lang = "ro"
     Defaults = RomanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Romanian"]

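Sketch of the ${nlp:lang} interpolation used in the data_paths block above (not part of the patch; a rough illustration of how the value is copied from the [nlp] section when the config is parsed):

from thinc.api import Config

cfg = Config().from_str("""
[nlp]
lang = "ro"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""")
# The data_paths block requests spacy-lookups-data tables for the same language code
print(cfg["nlp"]["lemmatizer"]["data_paths"]["lang"])  # -> "ro"
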
@@ -1,32 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ru"
+stop_words = {"@language_data": "spacy.ru.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.RussianLemmatizer.v1"
+"""
+
+
+@registry.language_data("spacy.ru.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ru.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.RussianLemmatizer.v1")
+def create_russian_lemmatizer() -> RussianLemmatizer:
+    return RussianLemmatizer()
+
+
 class RussianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ru"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return RussianLemmatizer(lookups)
 
 
 class Russian(Language):
     lang = "ru"
     Defaults = RussianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Russian"]

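Sketch of how the @lemmatizers reference in the [nlp.lemmatizer] block is resolved (not part of the patch; requires pymorphy2 to be installed, since the factory builds a RussianLemmatizer):

from spacy.util import registry
import spacy.lang.ru  # noqa: F401  - importing the module registers the factory

factory = registry.lemmatizers.get("spacy.RussianLemmatizer.v1")
lemmatizer = factory()  # equivalent to RussianLemmatizer()
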
@@ -1,11 +1,17 @@
+from typing import Optional, Tuple, Dict, List
+
 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
 from ...lemmatizer import Lemmatizer
+from ...lookups import Lookups
+
+
+PUNCT_RULES = {"«": '"', "»": '"'}
 
 
 class RussianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(RussianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
         if RussianLemmatizer._morph is None:
             RussianLemmatizer._morph = MorphAnalyzer()
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
-
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]
-
         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                 "VerbForm",
                 "Voice",
             ]
-
         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()
-
         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None
 
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string
 
 
-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }
-
     pos = "X"
     morphology = dict()
     unmatched = set()
-
     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                     morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)
-
     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"
-
     return pos, morphology
-
-
-PUNCT_RULES = {"«": '"', "»": '"'}

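Rough illustration of the pymorphy2 calls that lookup() relies on (not part of the patch; requires the pymorphy2 package):

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
analyses = morph.parse("стали")
# An ambiguous form yields several analyses, so RussianLemmatizer.lookup()
# would return the input unchanged; a single analysis returns its normal_form.
print(len(analyses) > 1, analyses[0].normal_form)
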
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SinhalaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "si"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.si.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.si.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.si.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Sinhala(Language):
     lang = "si"
-    Defaults = SinhalaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Sinhala"]

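For languages that ship only stop words and, in some cases, lexical attributes (si here; sk, sl and sq follow the same pattern below), the whole Defaults class disappears. A small sketch of inspecting the result (not part of the patch):

from spacy.lang.si import Sinhala

nlp_block = Sinhala.default_config["nlp"]
print(nlp_block["lang"])        # -> "si"
print(nlp_block["stop_words"])  # -> {"@language_data": "spacy.si.stop_words"}
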
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SlovakDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sk"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sk"
+stop_words = {"@language_data": "spacy.sk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.sk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Slovak(Language):
     lang = "sk"
-    Defaults = SlovakDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Slovak"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class SlovenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sl"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sl"
+stop_words = {"@language_data": "spacy.sl.stop_words"}
+"""
+
+
+@registry.language_data("spacy.sl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Slovenian(Language):
     lang = "sl"
-    Defaults = SlovenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Slovenian"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class AlbanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sq"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sq"
+stop_words = {"@language_data": "spacy.sq.stop_words"}
+"""
+
+
+@registry.language_data("spacy.sq.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
 
 
 class Albanian(Language):
     lang = "sq"
-    Defaults = AlbanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Albanian"]

@@ -1,23 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sr"
+stop_words = {"@language_data": "spacy.sr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.sr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class SerbianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sr"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Serbian(Language):
     lang = "sr"
     Defaults = SerbianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Serbian"]

@@ -1,35 +1,54 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...util import update_exc, registry
+from .syntax_iterators import SYNTAX_ITERATORS
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
-from .syntax_iterators import SYNTAX_ITERATORS
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sv"
+stop_words = {"@language_data": "spacy.sv.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.sv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sv.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class SwedishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sv"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
 
 
 class Swedish(Language):
     lang = "sv"
     Defaults = SwedishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Swedish"]

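Not part of the patch: a quick sketch of the split that remains for languages like Swedish, where tokenizer-level settings stay on the Defaults class while lexical data moves into the config:

from spacy.lang.sv import Swedish

print(Swedish.Defaults.suffixes is not None)        # punctuation rules still live on Defaults
print(Swedish.default_config["nlp"]["stop_words"])  # stop words are now a registry reference
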
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class TamilDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ta"
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ta"
+stop_words = {"@language_data": "spacy.ta.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.ta.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ta.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Tamil(Language):
     lang = "ta"
-    Defaults = TamilDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tamil"]

@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
 
 
-class TeluguDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "te"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "te"
+stop_words = {"@language_data": "spacy.te.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.te.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.te.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class Telugu(Language):
     lang = "te"
-    Defaults = TeluguDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Telugu"]

@@ -1,15 +1,44 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "th"
+stop_words = {"@language_data": "spacy.th.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.ThaiTokenizer.v1"
+"""
+
+
+@registry.language_data("spacy.th.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.th.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.ThaiTokenizer.v1")
+def create_thai_tokenizer():
+    def thai_tokenizer_factory(nlp):
+        return ThaiTokenizer(nlp)
+
+    return thai_tokenizer_factory
+
+
 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
+    def __init__(self, nlp: Language) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                 "The Thai tokenizer requires the PyThaiNLP library: "
                 "https://github.com/PyThaiNLP/pythainlp"
             )
-
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.vocab = nlp.vocab
 
-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
         spaces = [False] * len(words)
         return Doc(self.vocab, words=words, spaces=spaces)
 
 
-class ThaiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda _text: "th"
-    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return ThaiTokenizer(cls, nlp)
-
-
 class Thai(Language):
     lang = "th"
-    Defaults = ThaiDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Thai"]

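Sketch of how the [nlp.tokenizer] block resolves to the Thai tokenizer (not part of the patch; building the tokenizer itself requires the pythainlp package and a real nlp object):

from spacy.util import registry
import spacy.lang.th  # noqa: F401  - importing the module runs the @registry.tokenizers decorator

create_tokenizer = registry.tokenizers.get("spacy.ThaiTokenizer.v1")
tokenizer_factory = create_tokenizer()
# tokenizer = tokenizer_factory(nlp)  # called later with the nlp object, returns a ThaiTokenizer
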
| 
Tagalog (tl) — language data:

@@ -1,31 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
-def _return_tl(_):
-    return "tl"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tl"
+stop_words = {"@language_data": "spacy.tl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class TagalogDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_tl
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Tagalog(Language):
     lang = "tl"
     Defaults = TagalogDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tagalog"]
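Not part of the diff above — a minimal standalone sketch of the pattern it introduces: language data is registered under a string name, the config only stores that name, and the registry resolves it back to the callable. The sketch uses catalogue (the library behind spacy.util.registry) directly, so the registry namespace and the sample stop words below are illustrative stand-ins, not spaCy APIs or real data.

    # Illustrative sketch of the "registered language data + config" pattern.
    import catalogue
    from thinc.api import Config

    # Stand-in registry table; in the diff this role is played by
    # spacy.util.registry.language_data.
    language_data = catalogue.create("sketch", "language_data", entry_points=False)

    @language_data("sketch.tl.stop_words")
    def stop_words():
        return {"ang", "sa", "na"}  # stand-in values, not the real STOP_WORDS

    # The config stores only the string name; the registry resolves the callable.
    func = language_data.get("sketch.tl.stop_words")
    assert "ang" in func()

    # The DEFAULT_CONFIG string itself is plain text until parsed into a Config.
    config = Config().from_str('[nlp]\nlang = "tl"\n')
    assert config["nlp"]["lang"] == "tl"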
Turkish (tr) — language data:

@@ -1,26 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tr"
+stop_words = {"@language_data": "spacy.tr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class TurkishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
 
 
 class Turkish(Language):
     lang = "tr"
     Defaults = TurkishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Turkish"]
Tatar (tt) — language data:

@@ -1,28 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...attrs import LANG
 from ...language import Language
-from ...util import update_exc
+from ...util import update_exc, registry
 
 
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tt"
+stop_words = {"@language_data": "spacy.tt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.tt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class TatarDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tt"
-
-    lex_attr_getters.update(LEX_ATTRS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = tuple(TOKENIZER_INFIXES)
-
-    stop_words = STOP_WORDS
 
 
 class Tatar(Language):
     lang = "tt"
     Defaults = TatarDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Tatar"]
Ukrainian (uk) — language data:

@@ -1,36 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
 from .lemmatizer import UkrainianLemmatizer
 
 
-class UkrainianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "uk"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return UkrainianLemmatizer(lookups)
+DEFAULT_CONFIG = """
+[nlp]
+lang = "uk"
+stop_words = {"@language_data": "spacy.uk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.UkrainianLemmatizer.v1"
+"""
+
+
+@registry.language_data("spacy.uk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.uk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
+def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
+    return UkrainianLemmatizer()
+
+
+class UkrainianDefaults(Language.Defaults):
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 
 
 class Ukrainian(Language):
     lang = "uk"
     Defaults = UkrainianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Ukrainian"]
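Not part of the diff — a hedged sketch of how the lemmatizer entry registered above would be looked up. It assumes the WIP registry table `registry.lemmatizers` from this branch, and constructing the lemmatizer would additionally require pymorphy2 with the Ukrainian dictionaries, so the construction line is left commented out.

    # Hypothetical lookup of the factory registered as "spacy.UkrainianLemmatizer.v1".
    from spacy.util import registry  # WIP branch API, see the diff above

    make_lemmatizer = registry.lemmatizers.get("spacy.UkrainianLemmatizer.v1")
    # lemmatizer = make_lemmatizer()  # would import pymorphy2 on construction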
Ukrainian (uk) — lemmatizer:

@@ -1,11 +1,17 @@
+from typing import Optional, List, Tuple, Dict
+
 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from ...lookups import Lookups
 from ...lemmatizer import Lemmatizer
 
 
+PUNCT_RULES = {"«": '"', "»": '"'}
+
+
 class UkrainianLemmatizer(Lemmatizer):
     _morph = None
 
-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(UkrainianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             )
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
-
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]
-
         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))
-
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
                 "VerbForm",
                 "Voice",
             ]
-
         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)
-
         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()
-
         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None
 
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string
 
 
-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }
-
     pos = "X"
     morphology = dict()
     unmatched = set()
-
     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                     morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)
-
     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"
-
     return pos, morphology
-
-
-PUNCT_RULES = {"«": '"', "»": '"'}
Urdu (ur) — language data:

@@ -1,26 +1,53 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ur"
+stop_words = {"@language_data": "spacy.ur.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ur.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ur.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class UrduDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ur"
-
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
 
 
 class Urdu(Language):
     lang = "ur"
     Defaults = UrduDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Urdu"]
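Not part of the diff — a minimal sketch showing that the [nlp.writing_system] block, which replaces the old writing_system class attribute, parses into plain nested values. It only assumes thinc's Config, which the diff itself imports.

    from thinc.api import Config

    cfg_text = """
    [nlp]
    lang = "ur"

    [nlp.writing_system]
    direction = "rtl"
    has_case = false
    has_letters = true
    """
    config = Config().from_str(cfg_text)
    # Sections become nested dict-like values; JSON-style literals are parsed.
    assert config["nlp"]["writing_system"]["direction"] == "rtl"
    assert config["nlp"]["writing_system"]["has_case"] is False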
Vietnamese (vi) — language data and tokenizer:

@@ -1,38 +1,62 @@
-from ...attrs import LANG, NORM
-from ..norm_exceptions import BASE_NORMS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from ...language import Language
 from ...tokens import Doc
 from .stop_words import STOP_WORDS
-from ...util import add_lookups
+from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 
 
-class VietnameseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "vi"  # for pickling
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
-    use_pyvi = True
-
-
-class Vietnamese(Language):
-    lang = "vi"
-    Defaults = VietnameseDefaults  # override defaults
-
-    def make_doc(self, text):
-        if self.Defaults.use_pyvi:
+DEFAULT_CONFIG = """
+[nlp]
+lang = "vi"
+stop_words = {"@language_data": "spacy.vi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.VietnameseTokenizer.v1"
+use_pyvi = true
+"""
+
+
+@registry.language_data("spacy.vi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.vi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.VietnameseTokenizer.v1")
+def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+    def vietnamese_tokenizer_factory(nlp):
+        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+
+    return vietnamese_tokenizer_factory
+
+
+class VietnameseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, use_pyvi: bool = False):
+        self.vocab = nlp.vocab
+        self.use_pyvi = use_pyvi
+        if self.use_pyvi:
             try:
                 from pyvi import ViTokenizer
+
+                self.ViTokenizer = ViTokenizer
             except ImportError:
                 msg = (
-                    "Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
+                    "Pyvi not installed. Either set use_pyvi = False, "
                     "or install it https://pypi.python.org/pypi/pyvi"
                 )
                 raise ImportError(msg)
-            words, spaces = ViTokenizer.spacy_tokenize(text)
+
+    def __call__(self, text: str) -> Doc:
+        if self.use_pyvi:
+            words, spaces = self.ViTokenizer.spacy_tokenize(text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
             words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
             return Doc(self.vocab, words=words, spaces=spaces)
 
 
+class Vietnamese(Language):
+    lang = "vi"
+    default_config = Config().from_str(DEFAULT_CONFIG)
+
+
 __all__ = ["Vietnamese"]
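Not part of the diff — a self-contained sketch of the two-stage factory pattern used for the tokenizer above: the registered function closes over settings taken from the config and returns a factory that the Language object later calls with itself. All names below are illustrative stand-ins, not spaCy APIs.

    def create_tokenizer(use_pyvi: bool = True):
        # Stage 1: capture config settings in a closure.
        def tokenizer_factory(nlp):
            # Stage 2: the nlp object arrives later and supplies shared resources
            # (in spaCy, the vocab); here we just return a tuple as a stand-in.
            return (nlp, use_pyvi)
        return tokenizer_factory

    factory = create_tokenizer(use_pyvi=False)
    assert factory("dummy-nlp") == ("dummy-nlp", False)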
Multi-language (xx) — language data:

@@ -1,17 +1,17 @@
+from thinc.api import Config
+
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "xx"
+"""
 
 
 class MultiLanguageDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "xx"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = BASE_EXCEPTIONS
 
 
 class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):
 
     lang = "xx"
     Defaults = MultiLanguageDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["MultiLanguage"]
Yoruba (yo) — language data:

@@ -1,21 +1,39 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.yo.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.yo.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.yo.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
 
 
 class YorubaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "yo"
-    stop_words = STOP_WORDS
     tokenizer_exceptions = BASE_EXCEPTIONS
 
 
 class Yoruba(Language):
     lang = "yo"
     Defaults = YorubaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Yoruba"]
| 
						 | 
					@ -1,13 +1,15 @@
 | 
				
			||||||
 | 
					from typing import Optional, List, Set, Dict, Callable, Any
 | 
				
			||||||
 | 
					from enum import Enum
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from collections import OrderedDict
 | 
					from thinc.api import Config
 | 
				
			||||||
from ...attrs import LANG
 | 
					
 | 
				
			||||||
from ...errors import Warnings, Errors
 | 
					from ...errors import Warnings, Errors
 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...tokens import Doc
 | 
					from ...tokens import Doc
 | 
				
			||||||
from ...util import DummyTokenizer
 | 
					from ...util import DummyTokenizer, registry
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
from .lex_attrs import LEX_ATTRS
 | 
					from .lex_attrs import LEX_ATTRS
 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
| 
						 | 
					@ -16,88 +18,103 @@ from ... import util
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
					_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					DEFAULT_CONFIG = """
 | 
				
			||||||
 | 
					[nlp]
 | 
				
			||||||
 | 
					lang = "zh"
 | 
				
			||||||
 | 
					stop_words = {"@language_data": "spacy.zh.stop_words"}
 | 
				
			||||||
 | 
					lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def try_jieba_import(segmenter):
 | 
					[nlp.tokenizer]
 | 
				
			||||||
    try:
 | 
					@tokenizers = "spacy.ChineseTokenizer.v1"
 | 
				
			||||||
        import jieba
 | 
					segmenter = "char"
 | 
				
			||||||
 | 
					pkuseg_model = null
 | 
				
			||||||
 | 
					pkuseg_user_dict = "default"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if segmenter == "jieba":
 | 
					[nlp.writing_system]
 | 
				
			||||||
            # segment a short text to have jieba initialize its cache in advance
 | 
					direction = "ltr"
 | 
				
			||||||
            list(jieba.cut("作为", cut_all=False))
 | 
					has_case = false
 | 
				
			||||||
 | 
					has_letters = false
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return jieba
 | 
					
 | 
				
			||||||
    except ImportError:
 | 
					class Segmenter(str, Enum):
 | 
				
			||||||
        if segmenter == "jieba":
 | 
					    char = "char"
 | 
				
			||||||
            msg = (
 | 
					    jieba = "jieba"
 | 
				
			||||||
                "Jieba not installed. To use jieba, install it with `pip "
 | 
					    pkuseg = "pkuseg"
 | 
				
			||||||
                " install jieba` or from https://github.com/fxsjy/jieba"
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def values(cls):
 | 
				
			||||||
 | 
					        return list(cls.__members__.keys())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.language_data("spacy.zh.stop_words")
 | 
				
			||||||
 | 
					def stop_words() -> Set[str]:
 | 
				
			||||||
 | 
					    return STOP_WORDS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.language_data("spacy.zh.lex_attr_getters")
 | 
				
			||||||
 | 
					def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
				
			||||||
 | 
					    return LEX_ATTRS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.tokenizers("spacy.ChineseTokenizer.v1")
 | 
				
			||||||
 | 
					def create_chinese_tokenizer(
 | 
				
			||||||
 | 
					    segmenter: Segmenter = Segmenter.char,
 | 
				
			||||||
 | 
					    pkuseg_model: Optional[str] = None,
 | 
				
			||||||
 | 
					    pkuseg_user_dict: Optional[str] = "default",
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    def chinese_tokenizer_factory(nlp):
 | 
				
			||||||
 | 
					        return ChineseTokenizer(
 | 
				
			||||||
 | 
					            nlp,
 | 
				
			||||||
 | 
					            segmenter=segmenter,
 | 
				
			||||||
 | 
					            pkuseg_model=pkuseg_model,
 | 
				
			||||||
 | 
					            pkuseg_user_dict=pkuseg_user_dict,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
            raise ImportError(msg)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return chinese_tokenizer_factory
 | 
				
			||||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        import pkuseg
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if pkuseg_model:
 | 
					 | 
				
			||||||
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
 | 
					 | 
				
			||||||
        elif segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = (
 | 
					 | 
				
			||||||
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
 | 
					 | 
				
			||||||
                "was specified. Please provide the name of a pretrained model "
 | 
					 | 
				
			||||||
                "or the path to a model with "
 | 
					 | 
				
			||||||
                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
 | 
					 | 
				
			||||||
                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
            raise ValueError(msg)
 | 
					 | 
				
			||||||
    except ImportError:
 | 
					 | 
				
			||||||
        if segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
 | 
					 | 
				
			||||||
            raise ImportError(msg)
 | 
					 | 
				
			||||||
    except FileNotFoundError:
 | 
					 | 
				
			||||||
        if segmenter == "pkuseg":
 | 
					 | 
				
			||||||
            msg = "Unable to load pkuseg model from: " + pkuseg_model
 | 
					 | 
				
			||||||
            raise FileNotFoundError(msg)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ChineseTokenizer(DummyTokenizer):
 | 
					class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
    def __init__(self, cls, nlp=None, config={}):
 | 
					    def __init__(
 | 
				
			||||||
        self.supported_segmenters = ("char", "jieba", "pkuseg")
 | 
					        self,
 | 
				
			||||||
        self.configure_segmenter(config)
 | 
					        nlp: Language,
 | 
				
			||||||
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
 | 
					        segmenter: Segmenter = Segmenter.char,
 | 
				
			||||||
        # remove relevant settings from config so they're not also saved in
 | 
					        pkuseg_model: Optional[str] = None,
 | 
				
			||||||
        # Language.meta
 | 
					        pkuseg_user_dict: Optional[str] = None,
 | 
				
			||||||
        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
 | 
					    ):
 | 
				
			||||||
            if key in config:
 | 
					        self.vocab = nlp.vocab
 | 
				
			||||||
                del config[key]
 | 
					        if isinstance(segmenter, Segmenter):  # we might have the Enum here
 | 
				
			||||||
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
 | 
					            segmenter = segmenter.value
 | 
				
			||||||
 | 
					        self.segmenter = segmenter
 | 
				
			||||||
 | 
					        self.pkuseg_model = pkuseg_model
 | 
				
			||||||
 | 
					        self.pkuseg_user_dict = pkuseg_user_dict
 | 
				
			||||||
 | 
					        self.pkuseg_seg = None
 | 
				
			||||||
 | 
					        self.jieba_seg = None
 | 
				
			||||||
 | 
					        self.configure_segmenter(segmenter)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def configure_segmenter(self, config):
 | 
					    def configure_segmenter(self, segmenter: str):
 | 
				
			||||||
        self.segmenter = "char"
 | 
					        if segmenter not in Segmenter.values():
 | 
				
			||||||
        if "segmenter" in config:
 | 
					 | 
				
			||||||
            if config["segmenter"] in self.supported_segmenters:
 | 
					 | 
				
			||||||
                self.segmenter = config["segmenter"]
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
            warn_msg = Warnings.W103.format(
 | 
					            warn_msg = Warnings.W103.format(
 | 
				
			||||||
                lang="Chinese",
 | 
					                lang="Chinese",
 | 
				
			||||||
                    segmenter=config["segmenter"],
 | 
					                segmenter=segmenter,
 | 
				
			||||||
                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
					                supported=", ".join(Segmenter.values()),
 | 
				
			||||||
                default="'char' (character segmentation)",
 | 
					                default="'char' (character segmentation)",
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
 | 
					            self.segmenter = Segmenter.char
 | 
				
			||||||
        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
					        self.jieba_seg = try_jieba_import(self.segmenter)
 | 
				
			||||||
        self.pkuseg_seg = try_pkuseg_import(
 | 
					        self.pkuseg_seg = try_pkuseg_import(
 | 
				
			||||||
            self.segmenter,
 | 
					            self.segmenter,
 | 
				
			||||||
            pkuseg_model=config.get("pkuseg_model", None),
 | 
					            pkuseg_model=self.pkuseg_model,
 | 
				
			||||||
            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
 | 
					            pkuseg_user_dict=self.pkuseg_user_dict,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, text):
 | 
					    def __call__(self, text: str) -> Doc:
 | 
				
			||||||
        if self.segmenter == "jieba":
 | 
					        if self.segmenter == Segmenter.jieba:
 | 
				
			||||||
            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
					            words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
 | 
				
			||||||
            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
					            (words, spaces) = util.get_words_and_spaces(words, text)
 | 
				
			||||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
					            return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
        elif self.segmenter == "pkuseg":
 | 
					        elif self.segmenter == Segmenter.pkuseg:
 | 
				
			||||||
            if self.pkuseg_seg is None:
 | 
					            if self.pkuseg_seg is None:
 | 
				
			||||||
                raise ValueError(Errors.E1000)
 | 
					                raise ValueError(Errors.E1000)
 | 
				
			||||||
            words = self.pkuseg_seg.cut(text)
 | 
					            words = self.pkuseg_seg.cut(text)
 | 
				
			||||||
| 
						 | 
					@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
            return Doc(self.vocab, words=words, spaces=spaces)
 | 
					            return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # warn if segmenter setting is not the only remaining option "char"
 | 
					        # warn if segmenter setting is not the only remaining option "char"
 | 
				
			||||||
        if self.segmenter != "char":
 | 
					        if self.segmenter != Segmenter.char:
 | 
				
			||||||
            warn_msg = Warnings.W103.format(
 | 
					            warn_msg = Warnings.W103.format(
 | 
				
			||||||
                lang="Chinese",
 | 
					                lang="Chinese",
 | 
				
			||||||
                segmenter=self.segmenter,
 | 
					                segmenter=self.segmenter,
 | 
				
			||||||
                supported=", ".join([repr(s) for s in self.supported_segmenters]),
 | 
					                supported=", ".join(Segmenter.values()),
 | 
				
			||||||
                default="'char' (character segmentation)",
 | 
					                default="'char' (character segmentation)",
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
| 
						 | 
					@ -119,15 +136,14 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
					        (words, spaces) = util.get_words_and_spaces(words, text)
 | 
				
			||||||
        return Doc(self.vocab, words=words, spaces=spaces)
 | 
					        return Doc(self.vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pkuseg_update_user_dict(self, words, reset=False):
 | 
					    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
 | 
				
			||||||
        if self.segmenter == "pkuseg":
 | 
					        if self.segmenter == Segmenter.pkuseg:
 | 
				
			||||||
            if reset:
 | 
					            if reset:
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
                    import pkuseg
 | 
					                    import pkuseg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
					                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
 | 
				
			||||||
                except ImportError:
 | 
					                except ImportError:
 | 
				
			||||||
                    if self.segmenter == "pkuseg":
 | 
					 | 
				
			||||||
                    msg = (
 | 
					                    msg = (
 | 
				
			||||||
                        "pkuseg not installed: unable to reset pkuseg "
 | 
					                        "pkuseg not installed: unable to reset pkuseg "
 | 
				
			||||||
                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
					                        "user dict. Please " + _PKUSEG_INSTALL_MSG
 | 
				
			||||||
| 
						 | 
					@ -139,13 +155,6 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
					            warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
 | 
				
			||||||
            warnings.warn(warn_msg)
 | 
					            warnings.warn(warn_msg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_config(self):
 | 
					 | 
				
			||||||
        config = OrderedDict((("segmenter", self.segmenter),))
 | 
					 | 
				
			||||||
        return config
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _set_config(self, config={}):
 | 
					 | 
				
			||||||
        self.configure_segmenter(config)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def to_bytes(self, **kwargs):
 | 
					    def to_bytes(self, **kwargs):
 | 
				
			||||||
        pkuseg_features_b = b""
 | 
					        pkuseg_features_b = b""
 | 
				
			||||||
        pkuseg_weights_b = b""
 | 
					        pkuseg_weights_b = b""
 | 
				
			||||||
| 
						 | 
					@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
					                sorted(list(self.pkuseg_seg.postprocesser.common_words)),
 | 
				
			||||||
                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
					                sorted(list(self.pkuseg_seg.postprocesser.other_words)),
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        serializers = OrderedDict(
 | 
					        serializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_features": lambda: pkuseg_features_b,
 | 
				
			||||||
                ("cfg", lambda: srsly.json_dumps(self._get_config())),
 | 
					            "pkuseg_weights": lambda: pkuseg_weights_b,
 | 
				
			||||||
                ("pkuseg_features", lambda: pkuseg_features_b),
 | 
					            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
				
			||||||
                ("pkuseg_weights", lambda: pkuseg_weights_b),
 | 
					        }
 | 
				
			||||||
                (
 | 
					 | 
				
			||||||
                    "pkuseg_processors",
 | 
					 | 
				
			||||||
                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        return util.to_bytes(serializers, [])
 | 
					        return util.to_bytes(serializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_bytes(self, data, **kwargs):
 | 
					    def from_bytes(self, data, **kwargs):
 | 
				
			||||||
| 
						 | 
					@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
        def deserialize_pkuseg_processors(b):
 | 
					        def deserialize_pkuseg_processors(b):
 | 
				
			||||||
            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
					            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        deserializers = OrderedDict(
 | 
					        deserializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_features": deserialize_pkuseg_features,
 | 
				
			||||||
                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
 | 
					            "pkuseg_weights": deserialize_pkuseg_weights,
 | 
				
			||||||
                ("pkuseg_features", deserialize_pkuseg_features),
 | 
					            "pkuseg_processors": deserialize_pkuseg_processors,
 | 
				
			||||||
                ("pkuseg_weights", deserialize_pkuseg_weights),
 | 
					        }
 | 
				
			||||||
                ("pkuseg_processors", deserialize_pkuseg_processors),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        util.from_bytes(data, deserializers, [])
 | 
					        util.from_bytes(data, deserializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
					        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
 | 
				
			||||||
| 
						 | 
					@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                srsly.write_msgpack(path, data)
 | 
					                srsly.write_msgpack(path, data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        serializers = OrderedDict(
 | 
					        serializers = {
 | 
				
			||||||
            (
 | 
					            "pkuseg_model": lambda p: save_pkuseg_model(p),
 | 
				
			||||||
                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
 | 
					            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
 | 
				
			||||||
                ("pkuseg_model", lambda p: save_pkuseg_model(p)),
 | 
					        }
 | 
				
			||||||
                ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        return util.to_disk(path, serializers, [])
 | 
					        return util.to_disk(path, serializers, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_disk(self, path, **kwargs):
 | 
					    def from_disk(self, path, **kwargs):
 | 
				
			||||||
| 
						 | 
@@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
                         "pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG

@@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg)
-            if self.segmenter == "pkuseg":
+            if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)

-        serializers = OrderedDict(
-            (
-                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
-                ("pkuseg_model", lambda p: load_pkuseg_model(p)),
-                ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
-            )
-        )
+        serializers = {
+            "pkuseg_model": lambda p: load_pkuseg_model(p),
+            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
+        }
         util.from_disk(path, serializers, [])


 class ChineseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "zh"
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return ChineseTokenizer(cls, nlp, config=config)


 class Chinese(Language):
     lang = "zh"
-    Defaults = ChineseDefaults  # override defaults
+    Defaults = ChineseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)

-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_jieba_import(segmenter: str) -> None:
+    try:
+        import jieba
+
+        if segmenter == Segmenter.jieba:
+            # segment a short text to have jieba initialize its cache in advance
+            list(jieba.cut("作为", cut_all=False))
+
+        return jieba
+    except ImportError:
+        if segmenter == Segmenter.jieba:
+            msg = (
+                "Jieba not installed. To use jieba, install it with `pip "
+                " install jieba` or from https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+    try:
+        import pkuseg
+
+        if pkuseg_model:
+            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        elif segmenter == Segmenter.pkuseg:
+            msg = (
+                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+                "was specified. Please provide the name of a pretrained model "
+                "or the path to a model with:\n"
+                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
+                "nlp = Chinese.from_config(cfg)"
+            )
+            raise ValueError(msg)
+    except ImportError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+            raise ImportError(msg)
+    except FileNotFoundError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "Unable to load pkuseg model from: " + pkuseg_model
+            raise FileNotFoundError(msg)


 def _get_pkuseg_trie_data(node, path=""):
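For reference, the configuration pattern spelled out in the ValueError message above looks like this in practice. "mixed" is a placeholder for a pretrained pkuseg model name or a local path, and the call simply mirrors the message rather than documenting a finalized API:

    from spacy.lang.zh import Chinese

    cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": "mixed"}}}
    nlp = Chinese.from_config(cfg)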
							
								
								
									
spacy/language.py: diff suppressed because it is too large (1046 lines changed).
@@ -1,5 +1,14 @@
+from typing import Optional, Callable, List, Dict
+
+from .lookups import Lookups
 from .errors import Errors
 from .parts_of_speech import NAMES as UPOS_NAMES
+from .util import registry, load_language_data, SimpleFrozenDict
+
+
+@registry.lemmatizers("spacy.Lemmatizer.v1")
+def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
+    return Lemmatizer(data_paths=data_paths)


 class Lemmatizer:

@@ -14,17 +23,27 @@ class Lemmatizer:
     def load(cls, *args, **kwargs):
         raise NotImplementedError(Errors.E172)

-    def __init__(self, lookups, is_base_form=None):
+    def __init__(
+        self,
+        lookups: Optional[Lookups] = None,
+        data_paths: dict = SimpleFrozenDict(),
+        is_base_form: Optional[Callable] = None,
+    ) -> None:
         """Initialize a Lemmatizer.

         lookups (Lookups): The lookups object containing the (optional) tables
             "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
         RETURNS (Lemmatizer): The newly constructed object.
         """
-        self.lookups = lookups
+        self.lookups = lookups if lookups is not None else Lookups()
+        for name, filename in data_paths.items():
+            data = load_language_data(filename)
+            self.lookups.add_table(name, data)
         self.is_base_form = is_base_form

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         """Lemmatize a string.

         string (str): The string to lemmatize, e.g. the token text.

@@ -39,7 +58,6 @@ class Lemmatizer:
         if isinstance(univ_pos, int):
             univ_pos = UPOS_NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.lower()
-
         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.

@@ -67,65 +85,31 @@ class Lemmatizer:
         )
         return lemmas

-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
+    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "noun", morphology)

-    def verb(self, string, morphology=None):
+    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "verb", morphology)

-    def adj(self, string, morphology=None):
+    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adj", morphology)

-    def det(self, string, morphology=None):
+    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "det", morphology)

-    def pron(self, string, morphology=None):
+    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "pron", morphology)

-    def adp(self, string, morphology=None):
+    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adp", morphology)

-    def num(self, string, morphology=None):
+    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "num", morphology)

-    def punct(self, string, morphology=None):
+    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "punct", morphology)

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         """Look up a lemma in the table, if available. If no lemma is found,
         the original string is returned.

@@ -141,7 +125,13 @@ class Lemmatizer:
             return lookup_table[key]
         return string

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         orig = string
         string = string.lower()
         forms = []
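A quick sketch of the refactored constructor: lookups are now optional and can be populated either directly or via data_paths, which loads each file with load_language_data. The table contents below are toy data, and the import path assumes the module still lives at spacy.lemmatizer:

    from spacy.lemmatizer import Lemmatizer   # assumed module path
    from spacy.lookups import Lookups

    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog"})   # illustrative data only
    lemmatizer = Lemmatizer(lookups=lookups)
    assert lemmatizer.lookup("dogs") == "dog"

    # The registered "spacy.Lemmatizer.v1" factory builds the same object from
    # file paths instead, by passing data_paths={table_name: filename, ...}.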
@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict

 from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
 from .strings import get_string_id


 UNSET = object()


+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}
+
+
 class Lookups:
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,

@@ -18,7 +35,7 @@ class Lookups:
     via doc.vocab.lookups.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         """Initialize the Lookups object.

         RETURNS (Lookups): The newly created object.

@@ -27,7 +44,7 @@ class Lookups:
         """
         self._tables = {}

-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name. Delegates to
         Lookups.has_table.

@@ -36,16 +53,16 @@ class Lookups:
         """
         return self.has_table(name)

-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS (int): The number of tables in the lookups."""
         return len(self._tables)

     @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
         return list(self._tables.keys())

-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
         """Add a new table to the lookups. Raises an error if the table exists.

         name (str): Unique name of table.

@@ -60,12 +77,12 @@ class Lookups:
         self._tables[name] = table
         return table

-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
         """Get a table. Raises an error if the table doesn't exist and no
         default value is provided.

         name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.

         DOCS: https://spacy.io/api/lookups#get_table

@@ -76,7 +93,7 @@ class Lookups:
             return default
         return self._tables[name]

-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
         """Remove a table. Raises an error if the table doesn't exist.

         name (str): Name of the table to remove.

@@ -88,7 +105,7 @@ class Lookups:
             raise KeyError(Errors.E159.format(name=name, tables=self.tables))
         return self._tables.pop(name)

-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name.

         name (str): Name of the table.

@@ -98,7 +115,7 @@ class Lookups:
         """
         return name in self._tables

-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
         """Serialize the lookups to a bytestring.

         RETURNS (bytes): The serialized Lookups.

@@ -107,7 +124,7 @@ class Lookups:
         """
         return srsly.msgpack_dumps(self._tables)

-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
         """Load the lookups from a bytestring.

         bytes_data (bytes): The data to load.

@@ -120,7 +137,9 @@ class Lookups:
             self._tables[key] = Table(key, value)
         return self

-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
         """Save the lookups to a directory as lookups.bin. Expects a path to a
         directory, which will be created if it doesn't exist.

@@ -136,7 +155,9 @@ class Lookups:
             with filepath.open("wb") as file_:
                 file_.write(self.to_bytes())

-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
         """Load lookups from a directory containing a lookups.bin. Will skip
         loading if the file doesn't exist.

@@ -162,7 +183,7 @@ class Table(OrderedDict):
     """

     @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
         """Initialize a new table from a dict.

         data (dict): The dictionary.

@@ -175,7 +196,7 @@ class Table(OrderedDict):
         self.update(data)
         return self

-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
         """Initialize a new table.

         name (str): Optional table name for reference.

@@ -193,7 +214,7 @@ class Table(OrderedDict):
         if data:
             self.update(data)

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.

         key (str / int): The key to set.

@@ -203,7 +224,7 @@ class Table(OrderedDict):
         OrderedDict.__setitem__(self, key, value)
         self.bloom.add(key)

-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.
         Same as table[key] = value.

@@ -212,7 +233,7 @@ class Table(OrderedDict):
         """
         self[key] = value

-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.

@@ -221,7 +242,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.__getitem__(self, key)

-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
         """Get the value for a given key. String keys will be hashed.

         key (str / int): The key to get.

@@ -231,7 +252,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.get(self, key, default)

-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
         """Check whether a key is in the table. String keys will be hashed.

         key (str / int): The key to check.

@@ -243,7 +264,7 @@ class Table(OrderedDict):
             return False
         return OrderedDict.__contains__(self, key)

-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize table to a bytestring.

         RETURNS (bytes): The serialized table.

@@ -257,7 +278,7 @@ class Table(OrderedDict):
         }
         return srsly.msgpack_dumps(data)

-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
         """Load a table from a bytestring.

         bytes_data (bytes): The data to load.
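The type hints above don't change behaviour: Table still hashes string keys through get_string_id, and serialization still round-trips via msgpack. A small usage sketch with throwaway data:

    from spacy.lookups import Lookups

    lookups = Lookups()
    table = lookups.add_table("lemma_lookup", {"dogs": "dog"})  # toy table
    assert "lemma_lookup" in lookups       # __contains__ delegates to has_table
    assert table["dogs"] == "dog"          # string keys are hashed internally

    restored = Lookups().from_bytes(lookups.to_bytes())
    assert restored.get_table("lemma_lookup")["dogs"] == "dog"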
@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
     return kb
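The registered asset now takes a path to a serialized vocab directly instead of deriving it from a pipeline directory. A hedged sketch of calling it through the registry, assuming the assets registry is exposed on spacy.util.registry as the decorator above suggests; both paths are placeholders:

    from spacy.util import registry

    load_kb = registry.assets.get("spacy.KBFromFile.v1")
    kb = load_kb("/path/to/vocab", "/path/to/kb")   # hypothetical paths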
@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor

 from ..spacy_vectors import SpacyVectors
 from ... import util

@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams


 @registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the

@@ -90,13 +71,25 @@ def build_text_classifier(
             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
         )
         prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(PREFIX),
+            dropout=dropout,
+            seed=11,
         )
         suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SUFFIX),
+            dropout=dropout,
+            seed=12,
         )
         shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SHAPE),
+            dropout=dropout,
+            seed=13,
         )

         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
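The reformatted HashEmbed calls are purely cosmetic; the keyword arguments are unchanged. For illustration, a standalone construction with placeholder dimensions (any small nO/nV/column/seed values would do):

    from thinc.api import HashEmbed

    # Same keyword arguments as above; the concrete values are placeholders.
    prefix = HashEmbed(nO=32, nV=2000, column=1, dropout=0.1, seed=11)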
@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


 @registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
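The new upstream argument controls which Tok2Vec component the listener binds to; judging by the default value, "*" acts as a wildcard that matches any upstream component. A sketch of building the layer via the registry name shown above (the width is a placeholder):

    from spacy.util import registry

    make_listener = registry.architectures.get("spacy.Tok2VecTensors.v1")
    listener = make_listener(width=96)                      # default upstream="*"
    listener = make_listener(width=96, upstream="tok2vec")  # or bind to a named component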
@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
 from wasabi import Printer
 import warnings

 from .tokens import Doc, Token, Span
 from .errors import Errors, Warnings
+from .util import dot_to_dict
+
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401


-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
     """Analyze a pipeline component with respect to its position in the current
     pipeline and the other components. Will check whether requirements are
     fulfilled (e.g. if previous components assign the attributes).

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
     index (int): The index of the component in the pipeline.
     warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
     """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                 requires[annot] = True
     problems = []
     for annot, fulfilled in requires.items():

@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     return problems


-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
     """Analyze all pipes in the pipeline in order.

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
     """
     problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
     return problems


-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     """Validate component attributes provided to "assigns", "requires" etc.
     Raises error for invalid attributes and formatting. Doesn't check if
     custom extension attributes are registered, since this is something the
     user might want to do themselves later in the component.

-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
     """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
     objs = {"doc": Doc, "token": Token, "span": Span}
     for obj_key, attrs in data.items():
         if obj_key == "span":
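The local dot_to_dict helper is gone; validate_attrs now relies on the shared spacy.util.dot_to_dict, which expects a mapping rather than a plain iterable, hence the {value: True for value in values} comprehension. A sketch of the equivalence, assuming the shared helper expands dotted keys the way the removed docstring describes:

    from spacy.util import dot_to_dict

    attrs = ["token.pos", "token._.xyz"]
    assert dot_to_dict({value: True for value in attrs}) == {
        "token": {"pos": True, "_": {"xyz": True}}
    }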
@@ -111,37 +101,40 @@ def validate_attrs(values):
     return values


-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
     assert feature in ["assigns", "requires"]
     result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
         if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
     return result


-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that assign an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")


-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that require an attr, e.g. "doc.tensor".

-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")


-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
     well as any problems if available.

@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg = Printer(pretty=pretty, no_print=no_print)
     overview = []
     problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
     msg.divider("Pipeline Overview")
     header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
     msg.table(overview, header=header, divider=True, multiline=True)

@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
         return {"overview": overview, "problems": problems}


-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
     """Count how many subsequent components require an annotation set by each
     component in the pipeline.
+
+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
     """
     pipe_assigns = []
     pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
     counts = []
     for i, assigns in enumerate(pipe_assigns):
         count = 0
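All of these helpers now take the nlp object and read component metadata through nlp.get_pipe_meta instead of poking attributes on the component functions. A hedged usage sketch; the blank English pipeline is purely a stand-in, and the module path is an assumption:

    import spacy
    from spacy.pipe_analysis import analyze_all_pipes, print_summary  # assumed module path

    nlp = spacy.blank("en")                        # empty pipeline, illustrative only
    problems = analyze_all_pipes(nlp, warn=False)  # {} here, since nothing requires anything yet
    print_summary(nlp)                             # overview table of requires/assigns/retokenizes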
@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
 from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
 from .hooks import SentenceSegmenter, SimilarityHook
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens

 __all__ = [
-    "Tagger",
     "DependencyParser",
-    "EntityRecognizer",
     "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
     "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
     "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
     "SimilarityHook",
     "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
     "merge_entities",
     "merge_noun_chunks",
     "merge_subtokens",
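Aside (not part of the diff): splitting pipes.pyx into per-component modules leaves the public import surface unchanged, since the package still re-exports every class through __all__ above. For example:

    # These imports resolve to the new per-component modules via the package __init__.
    from spacy.pipeline import DependencyParser, EntityRecognizer, Tagger
    from spacy.pipeline import Morphologizer, Sentencizer, Tok2Vec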
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
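Aside (not part of the diff): the default_*() helpers deleted above loaded standalone *_defaults.cfg files; component defaults are now declared where the factory is registered. A minimal sketch of that pattern, assuming the spaCy v3 @Language.factory API; the component name and config key are invented:

    from spacy.language import Language

    @Language.factory("my_marker", default_config={"label": "CUSTOM"})
    def create_my_marker(nlp: Language, name: str, label: str):
        # default_config above replaces a standalone *_defaults.cfg lookup.
        def my_marker(doc):
            return doc  # no-op component, for illustration only
        return my_marker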
@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null
@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null
@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null
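Aside (not part of the diff): the removed *_defaults.cfg files, like the one above, hold model settings that now ship with the components themselves and with the unified training config. A rough Python rendering of the settings above, for illustration only; the variable name and dict shape are invented:

    # Values copied from the removed config above; null/true map to None/True.
    TRANSITION_BASED_MODEL_DEFAULTS = {
        "@architectures": "spacy.TransitionBasedParser.v1",
        "nr_feature_tokens": 6,
        "hidden_width": 64,
        "maxout_pieces": 2,
        "tok2vec": {
            "@architectures": "spacy.HashEmbedCNN.v1",
            "pretrained_vectors": None,
            "width": 96,
            "depth": 4,
            "embed_size": 2000,
            "window_size": 1,
            "maxout_pieces": 3,
            "subword_features": True,
            "dropout": None,
        },
    }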
Some files were not shown because too many files have changed in this diff.