diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index ac5987aa4..11ad564ec 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -16,7 +16,7 @@ from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc from spacy.gold import Example from spacy.util import compounding, minibatch, minibatch_by_words -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy from collections import defaultdict diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 0e0d4d4c3..3ab3ddaba 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -1,37 +1,46 @@ -# Training hyper-parameters and additional features. -[training] -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length or number of examples. -max_length = 5000 -limit = 0 -# Data augmentation -orth_variant_level = 0.0 -dropout = 0.1 -# Controls early-stopping. 0 or -1 mean unlimited. -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 200 -# Other settings -seed = 0 -accumulate_gradient = 1 -use_pytorch_for_gpu_memory = false -# Control how scores are printed and checkpoints are evaluated. 
-eval_batch_size = 128 -score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} +[paths] +train = "" +dev = "" +raw = null init_tok2vec = null -discard_oversize = false -batch_by = "words" -raw_text = null -tag_map = null -vectors = null -base_model = null -morph_rules = null -[training.batch_size] +[system] +seed = 0 +use_pytorch_for_gpu_memory = false + +[training] +seed = ${system:seed} +dropout = 0.1 +init_tok2vec = ${paths:init_tok2vec} +vectors = null +accumulate_gradient = 1 +max_steps = 0 +max_epochs = 0 +patience = 10000 +eval_frequency = 200 +score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} +frozen_components = [] + +[training.train_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[training.dev_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${training.train_corpus:gold_preproc} +max_length = 0 +limit = 0 + +[training.batcher] +@batchers = "batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 + +[training.batcher.size] @schedules = "compounding.v1" start = 100 stop = 1000 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index eed76cb7b..fc471ac43 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -1,30 +1,45 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null + +[system] +seed = 0 +use_pytorch_for_gpu_memory = false + [training] +seed = ${system:seed} +dropout = 0.2 +init_tok2vec = ${paths:init_tok2vec} +vectors = null +accumulate_gradient = 1 max_steps = 0 +max_epochs = 0 patience = 10000 eval_frequency = 200 -dropout = 0.2 -init_tok2vec = null -vectors = null -max_epochs = 100 -orth_variant_level = 0.0 +score_weights = {"dep_las": 0.8, "tag_acc": 0.2} + +[training.train_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:train} gold_preproc = true max_length = 0 -scores = 
["tag_acc", "dep_uas", "dep_las", "speed"] -score_weights = {"dep_las": 0.8, "tag_acc": 0.2} limit = 0 -seed = 0 -accumulate_gradient = 1 + +[training.dev_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${training.train_corpus:gold_preproc} +max_length = 0 +limit = 0 + +[training.batcher] +@batchers = "batch_by_words.v1" discard_oversize = false -raw_text = null -tag_map = null -morph_rules = null -base_model = null +tolerance = 0.2 -eval_batch_size = 128 -use_pytorch_for_gpu_memory = false -batch_by = "words" - -[training.batch_size] +[training.batcher.size] @schedules = "compounding.v1" start = 100 stop = 1000 diff --git a/examples/training/conllu.py b/examples/training/conllu.py index ecc07ccf2..a398b0ae0 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -13,7 +13,7 @@ import spacy import spacy.util from spacy.tokens import Token, Doc from spacy.gold import Example -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from collections import defaultdict from spacy.matcher import Matcher diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py index 5b17bb59e..0c6e29226 100644 --- a/examples/training/create_kb.py +++ b/examples/training/create_kb.py @@ -48,7 +48,8 @@ def main(model, output_dir=None): # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. # For simplicity, we'll just use the original vector dimension here instead. 
vectors_dim = nlp.vocab.vectors.shape[1] - kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim) + kb = KnowledgeBase(entity_vector_length=vectors_dim) + kb.initialize(nlp.vocab) # set up the data entity_ids = [] @@ -95,7 +96,8 @@ def main(model, output_dir=None): print("Loading vocab from", vocab_path) print("Loading KB from", kb_path) vocab2 = Vocab().from_disk(vocab_path) - kb2 = KnowledgeBase(vocab=vocab2) + kb2 = KnowledgeBase(entity_vector_length=1) + kb2.initialize(vocab2) kb2.load_bulk(kb_path) print() _print_kb(kb2) diff --git a/pyproject.toml b/pyproject.toml index 91f1464df..935b221d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a19,<8.0.0a30", + "thinc>=8.0.0a22,<8.0.0a30", "blis>=0.4.0,<0.5.0", "pytokenizations", "smart_open>=2.0.0,<3.0.0" diff --git a/requirements.txt b/requirements.txt index d0413825b..a082f4b6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a19,<8.0.0a30 +thinc>=8.0.0a22,<8.0.0a30 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index d2cb7c92a..249dc9827 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a19,<8.0.0a30 + thinc>=8.0.0a22,<8.0.0a30 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a19,<8.0.0a30 + thinc>=8.0.0a22,<8.0.0a30 blis>=0.4.0,<0.5.0 wasabi>=0.7.1,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/setup.py b/setup.py index 6d962ab59..af4cd0ec6 100755 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.kb", + "spacy.ml.parser_model", "spacy.morphology", "spacy.pipeline.dep_parser", "spacy.pipeline.morphologizer", @@ -40,14 
+41,14 @@ MOD_NAMES = [ "spacy.pipeline.sentencizer", "spacy.pipeline.senter", "spacy.pipeline.tagger", - "spacy.syntax.stateclass", - "spacy.syntax._state", + "spacy.pipeline.transition_parser", + "spacy.pipeline._parser_internals.arc_eager", + "spacy.pipeline._parser_internals.ner", + "spacy.pipeline._parser_internals.nonproj", + "spacy.pipeline._parser_internals._state", + "spacy.pipeline._parser_internals.stateclass", + "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", - "spacy.syntax.nn_parser", - "spacy.syntax._parser_model", - "spacy.syntax.nonproj", - "spacy.syntax.transition_system", - "spacy.syntax.arc_eager", "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", @@ -57,7 +58,6 @@ MOD_NAMES = [ "spacy.matcher.matcher", "spacy.matcher.phrasematcher", "spacy.matcher.dependencymatcher", - "spacy.syntax.ner", "spacy.symbols", "spacy.vectors", ] diff --git a/spacy/__init__.py b/spacy/__init__.py index da2b23a20..73e828936 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -8,6 +8,7 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu # noqa: F401 +from thinc.api import Config from . import pipeline # noqa: F401 from .cli.info import info # noqa: F401 @@ -26,17 +27,17 @@ if sys.maxunicode == 65535: def load( name: Union[str, Path], disable: Iterable[str] = tuple(), - component_cfg: Dict[str, Dict[str, Any]] = util.SimpleFrozenDict(), + config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), ) -> Language: """Load a spaCy model from an installed package or a local path. name (str): Package name or model path. disable (Iterable[str]): Names of pipeline components to disable. - component_cfg (Dict[str, dict]): Config overrides for pipeline components, - keyed by component names. + config (Dict[str, Any] / Config): Config overrides as nested dict or dict + keyed by section values in dot notation. 
RETURNS (Language): The loaded nlp object. """ - return util.load_model(name, disable=disable, component_cfg=component_cfg) + return util.load_model(name, disable=disable, config=config) def blank(name: str, **overrides) -> Language: diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 72fac05a6..bc47ffdef 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -15,6 +15,7 @@ from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 +from .init_config import init_config # noqa: F401 from .validate import validate # noqa: F401 from .project.clone import project_clone # noqa: F401 from .project.assets import project_assets # noqa: F401 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f277988f8..93ec9f31e 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -6,7 +6,7 @@ import hashlib import typer from typer.main import get_command from contextlib import contextmanager -from thinc.config import ConfigValidationError +from thinc.config import Config, ConfigValidationError from configparser import InterpolationError import sys @@ -31,6 +31,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes commands to check and validate your config files, training and evaluation data, and custom model implementations. """ +INIT_HELP = """Commands for initializing configs and models.""" # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. 
@@ -40,9 +41,11 @@ Opt = typer.Option app = typer.Typer(name=NAME, help=HELP) project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) +init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) app.add_typer(project_cli) app.add_typer(debug_cli) +app.add_typer(init_cli) def setup_cli() -> None: @@ -172,16 +175,34 @@ def get_checksum(path: Union[Path, str]) -> str: @contextmanager -def show_validation_error(title: str = "Config validation error"): +def show_validation_error( + file_path: Optional[Union[str, Path]] = None, + *, + title: str = "Config validation error", + hint_init: bool = True, +): """Helper to show custom config validation errors on the CLI. + file_path (str / Path): Optional file path of config file, used in hints. title (str): Title of the custom formatted error. + hint_init (bool): Show hint about filling config. """ try: yield except (ConfigValidationError, InterpolationError) as e: msg.fail(title, spaced=True) - print(str(e).replace("Config validation error", "").strip()) + # TODO: This is kinda hacky and we should probably provide a better + # helper for this in Thinc + err_text = str(e).replace("Config validation error", "").strip() + print(err_text) + if hint_init and "field required" in err_text: + config_path = file_path if file_path is not None else "config.cfg" + msg.text( + "If your config contains missing values, you can run the 'init " + "config' command to fill in all the defaults, if possible:", + spaced=True, + ) + print(f"{COMMAND} init config {config_path} --base {config_path}\n") sys.exit(1) @@ -196,3 +217,15 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: import_file("python_code", code_path) except Exception as e: msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) + + +def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: + """RETURNS (List[str]): All 
sourced components in the original config, + e.g. {"source": "en_core_web_sm"}. If the config contains a key + "factory", we assume it refers to a component factory. + """ + return [ + name + for name, cfg in config.get("components", {}).items() + if "factory" not in cfg and "source" in cfg + ] diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1ffceeca1..6c8c85e30 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -8,9 +8,9 @@ import typer from thinc.api import Config from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides -from ._util import import_code, debug_cli +from ._util import import_code, debug_cli, get_sourced_components from ..gold import Corpus, Example -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj from ..language import Language from .. import util @@ -33,7 +33,6 @@ def debug_config_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True), auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"), diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled") # fmt: on @@ -49,15 +48,12 @@ def debug_config_cli( """ overrides = parse_config_overrides(ctx.args) import_code(code_path) - with show_validation_error(): - config = Config().from_disk(config_path) + with show_validation_error(config_path): + config = Config().from_disk(config_path, overrides=overrides) try: - nlp, _ = util.load_model_from_config( - config, overrides=overrides, auto_fill=auto_fill - ) + nlp, _ = 
util.load_model_from_config(config, auto_fill=auto_fill) except ValueError as e: msg.fail(str(e), exits=1) - is_stdout = output_path is not None and str(output_path) == "-" if auto_fill: orig_config = config.to_str() filled_config = nlp.config.to_str() @@ -68,12 +64,7 @@ def debug_config_cli( if diff: print(diff_strings(config.to_str(), nlp.config.to_str())) else: - msg.good("Original config is valid", show=not is_stdout) - if is_stdout: - print(nlp.config.to_str()) - elif output_path is not None: - nlp.config.to_disk(output_path) - msg.good(f"Saved updated config to {output_path}") + msg.good("Original config is valid") @debug_cli.command( @@ -142,12 +133,13 @@ def debug_data( msg.fail("Development data not found", dev_path, exits=1) if not config_path.exists(): msg.fail("Config file not found", config_path, exists=1) - with show_validation_error(): - cfg = Config().from_disk(config_path) - nlp, config = util.load_model_from_config(cfg, overrides=config_overrides) - # TODO: handle base model - lang = config["nlp"]["lang"] - base_model = config["training"]["base_model"] + with show_validation_error(config_path): + cfg = Config().from_disk(config_path, overrides=config_overrides) + nlp, config = util.load_model_from_config(cfg) + # Use original config here, not resolved version + sourced_components = get_sourced_components(cfg) + frozen_components = config["training"]["frozen_components"] + resume_components = [p for p in sourced_components if p not in frozen_components] pipeline = nlp.pipe_names factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] tag_map_path = util.ensure_path(config["training"]["tag_map"]) @@ -169,13 +161,12 @@ def debug_data( loading_train_error_message = "" loading_dev_error_message = "" with msg.loading("Loading corpus..."): - corpus = Corpus(train_path, dev_path) try: - train_dataset = list(corpus.train_dataset(nlp)) + train_dataset = list(Corpus(train_path)(nlp)) except ValueError as e: loading_train_error_message = 
f"Training data cannot be loaded: {e}" try: - dev_dataset = list(corpus.dev_dataset(nlp)) + dev_dataset = list(Corpus(dev_path)(nlp)) except ValueError as e: loading_dev_error_message = f"Development data cannot be loaded: {e}" if loading_train_error_message or loading_dev_error_message: @@ -195,13 +186,15 @@ def debug_data( train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] + frozen_components = config["training"]["frozen_components"] msg.divider("Training stats") + msg.text(f"Language: {config['nlp']['lang']}") msg.text(f"Training pipeline: {', '.join(pipeline)}") - if base_model: - msg.text(f"Starting with base model '{base_model}'") - else: - msg.text(f"Starting with blank model '{lang}'") + if resume_components: + msg.text(f"Components from other models: {', '.join(resume_components)}") + if frozen_components: + msg.text(f"Frozen components: {', '.join(frozen_components)}") msg.text(f"{len(train_dataset)} training docs") msg.text(f"{len(dev_dataset)} evaluation docs") @@ -212,7 +205,9 @@ def debug_data( msg.warn(f"{overlap} training examples also in evaluation data") else: msg.good("No overlap between training and evaluation data") - if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD: + # TODO: make this feedback more fine-grained and report on updated + # components vs. 
blank components + if not resume_components and len(train_dataset) < BLANK_MODEL_THRESHOLD: text = ( f"Low number of examples to train from a blank model ({len(train_dataset)})" ) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 88e060238..cc6cb98ea 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,13 +2,11 @@ from typing import Dict, Any, Optional from pathlib import Path from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config -from thinc.api import Model +from thinc.api import Model, data_validation import typer from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides from .. import util -from ..lang.en import English -from ..util import dot_to_object @debug_cli.command("model") @@ -16,7 +14,7 @@ def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True), - section: str = Arg(..., help="Section that defines the model to be analysed"), + component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"), layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"), dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"), parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"), @@ -25,7 +23,7 @@ def debug_model_cli( P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"), P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"), P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"), - P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"), + P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): @@ -50,10 +48,10 @@ 
def debug_model_cli( "print_prediction": P3, } config_overrides = parse_config_overrides(ctx.args) - cfg = Config().from_disk(config_path) - with show_validation_error(): + with show_validation_error(config_path): + cfg = Config().from_disk(config_path, overrides=config_overrides) try: - _, config = util.load_model_from_config(cfg, overrides=config_overrides) + nlp, config = util.load_model_from_config(cfg) except ValueError as e: msg.fail(str(e), exits=1) seed = config["pretraining"]["seed"] @@ -61,12 +59,12 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) - component = dot_to_object(config, section) - if hasattr(component, "model"): - model = component.model + pipe = nlp.get_pipe(component) + if hasattr(pipe, "model"): + model = pipe.model else: msg.fail( - f"The section '{section}' does not specify an object that holds a Model.", + f"The component '{component}' does not specify an object that holds a Model.", exits=1, ) debug_model(model, print_settings=print_settings) @@ -84,15 +82,17 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 0: Printing before training msg.info(f"Analysing model with ID {model.id}") if print_settings.get("print_before_training"): - msg.info(f"Before training:") + msg.divider(f"STEP 0 - before training") _print_model(model, print_settings) # STEP 1: Initializing the model and printing again Y = _get_output(model.ops.xp) _set_output_dim(nO=Y.shape[-1], model=model) - model.initialize(X=_get_docs(), Y=Y) + # The output vector might differ from the official type of the output layer + with data_validation(False): + model.initialize(X=_get_docs(), Y=Y) if print_settings.get("print_after_init"): - msg.info(f"After initialization:") + msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) # STEP 2: Updating the model and printing again @@ -104,13 +104,14 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None 
get_dX(dY) model.finish_update(optimizer) if print_settings.get("print_after_training"): - msg.info(f"After training:") + msg.divider(f"STEP 2 - after training") _print_model(model, print_settings) # STEP 3: the final prediction prediction = model.predict(_get_docs()) if print_settings.get("print_prediction"): - msg.info(f"Prediction:", str(prediction)) + msg.divider(f"STEP 3 - prediction") + msg.info(str(prediction)) def get_gradient(model, Y): @@ -127,8 +128,8 @@ def _sentences(): ] -def _get_docs(): - nlp = English() +def _get_docs(lang: str = "en"): + nlp = util.get_lang_class(lang)() return list(nlp.pipe(_sentences())) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index cdbd7514a..e55e6e40e 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -7,23 +7,7 @@ import typer from ._util import app, Arg, Opt from .. import about from ..util import is_package, get_base_version, run_command - -# These are the old shortcuts we previously supported in spacy download. As of -# v3, shortcuts are deprecated so we're not expecting to add anything to this -# list. It only exists to show users warnings. -OLD_SHORTCUTS = { - "en": "en_core_web_sm", - "de": "de_core_news_sm", - "es": "es_core_news_sm", - "pt": "pt_core_news_sm", - "fr": "fr_core_news_sm", - "it": "it_core_news_sm", - "nl": "nl_core_news_sm", - "el": "el_core_news_sm", - "nb": "nb_core_news_sm", - "lt": "lt_core_news_sm", - "xx": "xx_ent_wiki_sm", -} +from ..errors import OLD_MODEL_SHORTCUTS @app.command( @@ -66,12 +50,12 @@ def download(model: str, direct: bool = False, *pip_args) -> None: download_model(dl_tpl.format(m=model_name, v=version), pip_args) else: model_name = model - if model in OLD_SHORTCUTS: + if model in OLD_MODEL_SHORTCUTS: msg.warn( - f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. " - f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead." + f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. 
Please " + f"use the full model name '{OLD_MODEL_SHORTCUTS[model]}' instead." ) - model_name = OLD_SHORTCUTS[model] + model_name = OLD_MODEL_SHORTCUTS[model] compatibility = get_compatibility() version = get_version(model_name, compatibility) download_model(dl_tpl.format(m=model_name, v=version), pip_args) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index ee1be57a3..5b434ee32 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,5 +1,4 @@ from typing import Optional, List, Dict -from timeit import default_timer as timer from wasabi import Printer from pathlib import Path import re @@ -64,9 +63,9 @@ def evaluate( msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) - corpus = Corpus(data_path, data_path) + corpus = Corpus(data_path, gold_preproc=gold_preproc) nlp = util.load_model(model) - dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) + dev_dataset = list(corpus(nlp)) scores = nlp.evaluate(dev_dataset, verbose=False) metrics = { "TOK": "token_acc", diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py new file mode 100644 index 000000000..01664ee40 --- /dev/null +++ b/spacy/cli/init_config.py @@ -0,0 +1,81 @@ +from typing import Optional, List +from pathlib import Path +from thinc.api import Config +from wasabi import msg + +from ..util import load_model_from_config, get_lang_class, load_model +from ._util import init_cli, Arg, Opt, show_validation_error + + +@init_cli.command("config") +def init_config_cli( + # fmt: off + output_path: Path = Arg("-", help="Output path or - for stdout", allow_dash=True), + base_path: Optional[Path] = Opt(None, "--base", "-b", help="Optional base config to fill", exists=True, dir_okay=False), + model: Optional[str] = Opt(None, "--model", "-m", help="Optional model to copy config from"), + lang: Optional[str] = Opt(None, "--lang", "-l", 
help="Optional language code for blank config"), + pipeline: Optional[str] = Opt(None, "--pipeline", "-p", help="Optional pipeline components to use") + # fmt: on +): + """Generate a starter config.cfg for training.""" + validate_cli_args(base_path, model, lang) + is_stdout = str(output_path) == "-" + pipeline = [p.strip() for p in pipeline.split(",")] if pipeline else [] + cfg = init_config(output_path, base_path, model, lang, pipeline, silent=is_stdout) + if is_stdout: + print(cfg.to_str()) + else: + cfg.to_disk(output_path) + msg.good("Saved config", output_path) + + +def init_config( + output_path: Path, + config_path: Optional[Path], + model: Optional[str], + lang: Optional[str], + pipeline: Optional[List[str]], + silent: bool = False, +) -> Config: + if config_path is not None: + msg.info("Generating config from base config", show=not silent) + with show_validation_error(config_path, hint_init=False): + config = Config().from_disk(config_path) + try: + nlp, _ = load_model_from_config(config, auto_fill=True) + except ValueError as e: + msg.fail(str(e), exits=1) + return nlp.config + if model is not None: + ext = f" with pipeline {pipeline}" if pipeline else "" + msg.info(f"Generating config from model {model}{ext}", show=not silent) + nlp = load_model(model) + for existing_pipe_name in nlp.pipe_names: + if existing_pipe_name not in pipeline: + nlp.remove_pipe(existing_pipe_name) + for pipe_name in pipeline: + if pipe_name not in nlp.pipe_names: + nlp.add_pipe(pipe_name) + return nlp.config + if lang is not None: + ext = f" with pipeline {pipeline}" if pipeline else "" + msg.info(f"Generating config for language '{lang}'{ext}", show=not silent) + nlp = get_lang_class(lang)() + for pipe_name in pipeline: + nlp.add_pipe(pipe_name) + return nlp.config + + +def validate_cli_args( + config_path: Optional[Path], model: Optional[str], lang: Optional[str] +) -> None: + args = {"--base": config_path, "--model": model, "--lang": lang} + if sum(arg is not None for arg in 
args.values()) != 1: + existing = " ".join(f"{a} {v}" for a, v in args.items() if v is not None) + msg.fail( + "The init config command expects only one of the following arguments: " + "--base (base config to fill and update), --lang (language code to " + "use for blank config) or --model (base model to copy config from).", + f"Got: {existing if existing else 'no arguments'}", + exits=1, + ) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index e1dca2395..4fdd2bbbc 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -10,14 +10,14 @@ import gzip import zipfile import srsly import warnings -from wasabi import Printer +from wasabi import msg, Printer +import typer -from ._util import app, Arg, Opt +from ._util import app, init_cli, Arg, Opt from ..vectors import Vectors from ..errors import Errors, Warnings from ..language import Language from ..util import ensure_path, get_lang_class, load_model, OOV_RANK -from ..lookups import Lookups try: import ftfy @@ -28,9 +28,15 @@ except ImportError: DEFAULT_OOV_PROB = -20 -@app.command("init-model") +@init_cli.command("model") +@app.command( + "init-model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + hidden=True, # hide this from main CLI help but still allow it to work with warning +) def init_model_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments lang: str = Arg(..., help="Model language"), output_dir: Path = Arg(..., help="Model output directory"), freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), @@ -48,6 +54,12 @@ def init_model_cli( Create a new model from raw data. If vectors are provided in Word2Vec format, they can be either a .txt or zipped as a .zip or .tar.gz. """ + if ctx.command.name == "init-model": + msg.warn( + "The init-model command is now available via the 'init model' " + "subcommand (without the hyphen). 
You can run python -m spacy init " + "--help for an overview of the other available initialization commands." + ) init_model( lang, output_dir, diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7d1a217be..7202ccacf 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -87,9 +87,9 @@ def pretrain( else: msg.info("Using CPU") msg.info(f"Loading config from: {config_path}") - config = Config().from_disk(config_path) - with show_validation_error(): - nlp, config = util.load_model_from_config(config, overrides=config_overrides) + with show_validation_error(config_path): + config = Config().from_disk(config_path, overrides=config_overrides) + nlp, config = util.load_model_from_config(config) # TODO: validate that [pretraining] block exists if not output_dir.exists(): output_dir.mkdir() diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index e42935e2f..3be784e04 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -1,7 +1,6 @@ from typing import Optional from pathlib import Path from wasabi import msg -import tqdm import re import shutil import requests diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9cc36f77b..c5c6e7252 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -11,10 +11,10 @@ import random import typer from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code -from ..gold import Corpus, Example +from ._util import import_code, get_sourced_components from ..language import Language from .. 
import util +from ..gold.example import Example from ..errors import Errors @@ -28,8 +28,6 @@ from ..ml import models # noqa: F401 def train_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - train_path: Path = Arg(..., help="Location of training data", exists=True), - dev_path: Path = Arg(..., help="Location of development data", exists=True), config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), @@ -51,12 +49,11 @@ def train_cli( referenced in the config. """ util.set_env_log(verbose) - verify_cli_args(train_path, dev_path, config_path, output_path) + verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) train( config_path, - {"train": train_path, "dev": dev_path}, output_path=output_path, config_overrides=overrides, use_gpu=use_gpu, @@ -66,8 +63,6 @@ def train_cli( def train( config_path: Path, - data_paths: Dict[str, Path], - raw_text: Optional[Path] = None, output_path: Optional[Path] = None, config_overrides: Dict[str, Any] = {}, use_gpu: int = -1, @@ -79,41 +74,37 @@ def train( else: msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") - config = Config().from_disk(config_path) + with show_validation_error(config_path): + config = Config().from_disk(config_path, overrides=config_overrides) if config.get("training", {}).get("seed") is not None: fix_random_seed(config["training"]["seed"]) - with show_validation_error(): - nlp, config = util.load_model_from_config(config, overrides=config_overrides) - if config["training"]["base_model"]: - # TODO: do something to check base_nlp against regular nlp described in config? 
- # If everything matches it will look something like: - # base_nlp = util.load_model(config["training"]["base_model"]) - # nlp = base_nlp - raise NotImplementedError("base_model not supported yet.") + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + with show_validation_error(config_path): + nlp, config = util.load_model_from_config(config) if config["training"]["vectors"] is not None: util.load_vectors_into_model(nlp, config["training"]["vectors"]) verify_config(nlp) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) - if config["training"]["use_pytorch_for_gpu_memory"]: + if config.get("system", {}).get("use_pytorch_for_gpu_memory"): # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() - training = config["training"] - optimizer = training["optimizer"] - limit = training["limit"] - corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit) - if resume_training: - msg.info("Resuming training") - nlp.resume_training() - else: - msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - train_examples = corpus.train_dataset( - nlp, - shuffle=False, - gold_preproc=training["gold_preproc"], - max_length=training["max_length"], - ) - train_examples = list(train_examples) - nlp.begin_training(lambda: train_examples) + T_cfg = config["training"] + optimizer = T_cfg["optimizer"] + train_corpus = T_cfg["train_corpus"] + dev_corpus = T_cfg["dev_corpus"] + batcher = T_cfg["batcher"] + # Components that shouldn't be updated during training + frozen_components = T_cfg["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + msg.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + msg.info(f"Resuming training for: {resume_components}") + nlp.resume_training() + with 
nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.begin_training(lambda: train_corpus(nlp)) if tag_map: # Replace tag map with provided mapping @@ -139,38 +130,36 @@ def train( msg.fail(err, exits=1) tok2vec.from_bytes(weights_data) - msg.info("Loading training corpus") - train_batches = create_train_batches(nlp, corpus, training) - evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) - # Create iterator, which yields out info after each optimization step. msg.info("Start training") + score_weights = T_cfg["score_weights"] training_step_iterator = train_while_improving( nlp, optimizer, - train_batches, - evaluate, - dropout=training["dropout"], - accumulate_gradient=training["accumulate_gradient"], - patience=training["patience"], - max_steps=training["max_steps"], - eval_frequency=training["eval_frequency"], - raw_text=raw_text, + create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]), + create_evaluation_callback(nlp, dev_corpus, score_weights), + dropout=T_cfg["dropout"], + accumulate_gradient=T_cfg["accumulate_gradient"], + patience=T_cfg["patience"], + max_steps=T_cfg["max_steps"], + eval_frequency=T_cfg["eval_frequency"], + raw_text=None, + exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - print_row = setup_printer(training, nlp) + print_row = setup_printer(T_cfg, nlp) try: - progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) + progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) for batch, info, is_best_checkpoint in training_step_iterator: progress.update(1) if is_best_checkpoint is not None: progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - update_meta(training, nlp, info) + update_meta(T_cfg, nlp, info) nlp.to_disk(output_path / "model-best") - progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) + progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) except Exception as e: if output_path is not None: msg.warn( @@ -191,72 +180,32 @@ def train( msg.good(f"Saved model to output directory {final_model_path}") -def create_train_batches( - nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]] -): - max_epochs = cfg["max_epochs"] - train_examples = list( - corpus.train_dataset( - nlp, - shuffle=True, - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ) - ) - epoch = 0 - batch_strategy = cfg["batch_by"] - while True: - if len(train_examples) == 0: - raise ValueError(Errors.E988) - epoch += 1 - if batch_strategy == "padded": - batches = util.minibatch_by_padded_size( - train_examples, - size=cfg["batch_size"], - buffer=256, - discard_oversize=cfg["discard_oversize"], - ) - elif batch_strategy == "words": - batches = util.minibatch_by_words( - train_examples, - size=cfg["batch_size"], - discard_oversize=cfg["discard_oversize"], - ) - else: - batches = util.minibatch(train_examples, size=cfg["batch_size"]) - # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop - try: - first = next(batches) - yield epoch, first - except StopIteration: - raise ValueError(Errors.E986) - for batch in batches: +def create_train_batches(iterator, batcher, max_epochs: 
int): + epoch = 1 + examples = [] + # Stream the first epoch, so we start training faster and support + # infinite streams. + for batch in batcher(iterator): + yield epoch, batch + if max_epochs != 1: + examples.extend(batch) + if not examples: + # Raise error if no data + raise ValueError(Errors.E986) + while epoch != max_epochs: + random.shuffle(examples) + for batch in batcher(examples): yield epoch, batch - if max_epochs >= 1 and epoch >= max_epochs: - break - random.shuffle(train_examples) + epoch += 1 def create_evaluation_callback( - nlp: Language, - optimizer: Optimizer, - corpus: Corpus, - cfg: Union[Config, Dict[str, Any]], + nlp: Language, dev_corpus: Callable, weights: Dict[str, float], ) -> Callable[[], Tuple[float, Dict[str, float]]]: def evaluate() -> Tuple[float, Dict[str, float]]: - dev_examples = corpus.dev_dataset( - nlp, gold_preproc=cfg["gold_preproc"] - ) - dev_examples = list(dev_examples) - n_words = sum(len(ex.predicted) for ex in dev_examples) - batch_size = cfg["eval_batch_size"] - if optimizer.averages: - with nlp.use_params(optimizer.averages): - scores = nlp.evaluate(dev_examples, batch_size=batch_size) - else: - scores = nlp.evaluate(dev_examples, batch_size=batch_size) + dev_examples = list(dev_corpus(nlp)) + scores = nlp.evaluate(dev_examples) # Calculate a weighted sum based on score_weights for the main score - weights = cfg["score_weights"] try: weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) except KeyError as e: @@ -280,6 +229,7 @@ def train_while_improving( patience: int, max_steps: int, raw_text: List[Dict[str, str]], + exclude: List[str], ): """Train until an evaluation stops improving. 
Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -325,8 +275,6 @@ def train_while_improving( dropouts = dropout results = [] losses = {} - to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] - if raw_text: random.shuffle(raw_text) raw_examples = [ @@ -336,20 +284,26 @@ def train_while_improving( for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) - with nlp.select_pipes(enable=to_enable): - for subbatch in subdivide_batch(batch, accumulate_gradient): - nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. - raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) - for name, proc in nlp.pipeline: - if hasattr(proc, "model"): - proc.model.finish_update(optimizer) + for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( + subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + ) + if raw_text: + # If raw text is available, perform 'rehearsal' updates, + # which use unlabelled data to reduce overfitting. 
+ raw_batch = list(next(raw_batches)) + nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude) + # TODO: refactor this so we don't have to run it separately in here + for name, proc in nlp.pipeline: + if name not in exclude and hasattr(proc, "model"): + proc.model.finish_update(optimizer) optimizer.step_schedules() if not (step % eval_frequency): - score, other_scores = evaluate() + if optimizer.averages: + with nlp.use_params(optimizer.averages): + score, other_scores = evaluate() + else: + score, other_scores = evaluate() results.append((score, step)) is_best_checkpoint = score == max(results)[0] else: @@ -460,17 +414,7 @@ def load_from_paths( msg.fail("Can't find raw text", raw_text, exits=1) raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) tag_map = {} - tag_map_path = util.ensure_path(config["training"]["tag_map"]) - if tag_map_path is not None: - if not tag_map_path.exists(): - msg.fail("Can't find tag map path", tag_map_path, exits=1) - tag_map = srsly.read_json(config["training"]["tag_map"]) morph_rules = {} - morph_rules_path = util.ensure_path(config["training"]["morph_rules"]) - if morph_rules_path is not None: - if not morph_rules_path.exists(): - msg.fail("Can't find tag map path", morph_rules_path, exits=1) - morph_rules = srsly.read_json(config["training"]["morph_rules"]) weights_data = None init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) if init_tok2vec is not None: @@ -481,19 +425,10 @@ def load_from_paths( return raw_text, tag_map, morph_rules, weights_data -def verify_cli_args( - train_path: Path, - dev_path: Path, - config_path: Path, - output_path: Optional[Path] = None, -) -> None: +def verify_cli_args(config_path: Path, output_path: Optional[Path] = None,) -> None: # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) - if not train_path or not train_path.exists(): - 
msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) if output_path is not None: if not output_path.exists(): output_path.mkdir() diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 0580d34c5..e6ba284df 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -13,8 +13,9 @@ from ..util import get_package_path, get_model_meta, is_compatible_version @app.command("validate") def validate_cli(): """ - Validate that the currently installed version of spaCy is compatible - with the installed models. Should be run after `pip install -U spacy`. + Validate the currently installed models and spaCy version. Checks if the + installed models are compatible and shows upgrade instructions if available. + Should be run after `pip install -U spacy`. """ validate() diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index fead996ba..353924280 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,7 +1,20 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null + +[system] +seed = 0 +use_pytorch_for_gpu_memory = false + [nlp] lang = null pipeline = [] load_vocab_data = true +before_creation = null +after_creation = null +after_pipeline_creation = null [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" @@ -13,38 +26,57 @@ load_vocab_data = true # Training hyper-parameters and additional features. [training] -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length or number of examples. 
-max_length = 5000 -limit = 0 -# Data augmentation -orth_variant_level = 0.0 +seed = ${system:seed} dropout = 0.1 +accumulate_gradient = 1 +# Extra resources for transfer-learning or pseudo-rehearsal +init_tok2vec = ${paths:init_tok2vec} +raw_text = ${paths:raw} +vectors = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 max_epochs = 0 max_steps = 20000 eval_frequency = 200 -eval_batch_size = 128 -# Other settings -seed = 0 -accumulate_gradient = 1 -use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. score_weights = {} -# These settings are invalid for the transformer models. -init_tok2vec = null +# Names of pipeline components that shouldn't be updated during training +frozen_components = [] + +[training.train_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 2000 +# Limitation on number of training examples +limit = 0 + +[training.dev_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. 
+gold_preproc = false +# Limitations on training document length +max_length = 2000 +# Limitation on number of training examples +limit = 0 + +[training.batcher] +@batchers = "batch_by_words.v1" discard_oversize = false -raw_text = null -tag_map = null -morph_rules = null -base_model = null -vectors = null -batch_by = "words" -batch_size = 1000 +tolerance = 0.2 + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 [training.optimizer] @optimizers = "Adam.v1" @@ -69,8 +101,8 @@ max_length = 500 dropout = 0.2 n_save_every = null batch_size = 3000 -seed = ${training:seed} -use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} +seed = ${system:seed} +use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory} tok2vec_model = "components.tok2vec.model" [pretraining.objective] diff --git a/spacy/errors.py b/spacy/errors.py index 3fe53d6db..5c443ccad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -63,8 +63,6 @@ class Warnings: "have the spacy-lookups-data package installed.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") - W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " - "previous components in the pipeline declare that they assign it.") W026 = ("Unable to set all sentence boundaries from dependency parses.") W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " @@ -376,7 +374,8 @@ class Errors: E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input " "includes either the `text` or `tokens` key. For more info, see " "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl") - E139 = ("Knowledge Base for component '{name}' is empty.") + E139 = ("Knowledge Base for component '{name}' is empty. 
Use the methods " + "kb.add_entity and kb.add_alias to add entries.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -483,10 +482,31 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E941 = ("Can't find model '{name}'. It looks like you're trying to load a " + "model from a shortcut, which is deprecated as of spaCy v3.0. To " + "load the model, use its full name instead:\n\n" + "nlp = spacy.load(\"{full}\")\n\nFor more details on the available " + "models, see the models directory: https://spacy.io/models. If you " + "want to create a blank model, use spacy.blank: " + "nlp = spacy.blank(\"{name}\")") + E942 = ("Executing after_{name} callback failed. Expected the function to " + "return an initialized nlp object but got: {value}. Maybe " + "you forgot to return the modified object in your function?") + E943 = ("Executing before_creation callback failed. Expected the function to " + "return an uninitialized Language subclass but got: {value}. Maybe " + "you forgot to return the modified object in your function or " + "returned the initialized nlp object instead?") + E944 = ("Can't copy pipeline component '{name}' from source model '{model}': " + "not found in pipeline. Available components: {opts}") + E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded " + "nlp object, but got: {source}") + E946 = ("The Vocab for the knowledge base is not initialized. 
Did you forget to " + "call kb.initialize()?") E947 = ("Matcher.add received invalid 'greedy' argument: expected " "a string value from {expected} but got: '{arg}'") E948 = ("Matcher.add received invalid 'patterns' argument: expected " "a List, but got: {arg_type}") + E949 = ("Can only create an alignment when the texts are the same.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") @@ -569,11 +589,13 @@ class Errors: "into {values}, but found {value}.") E983 = ("Invalid key for '{dict}': {key}. Available keys: " "{keys}") - E984 = ("Invalid component config for '{name}': no 'factory' key " - "specifying the registered function used to initialize the " - "component. For example, factory = \"ner\" will use the 'ner' " - "factory and all other settings in the block will be passed " - "to it as arguments.\n\n{config}") + E984 = ("Invalid component config for '{name}': component block needs either " + "a key 'factory' specifying the registered function used to " + "initialize the component, or a key 'source' key specifying a " + "spaCy model to copy the component from. For example, factory = " + "\"ner\" will use the 'ner' factory and all other settings in the " + "block will be passed to it as arguments. Alternatively, source = " + "\"en_core_web_sm\" will copy the component from that model.\n\n{config}") E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}") E986 = ("Could not create any training batches: check your input. 
" "Perhaps discard_oversize should be set to False ?") @@ -608,6 +630,9 @@ class Errors: "initializing the pipeline:\n" 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n' 'nlp = Chinese(config=cfg)') + E1001 = ("Target token outside of matched span for match with tokens " + "'{span}' and offset '{index}' matched by patterns '{patterns}'.") + E1002 = ("Span index out of range.") @add_codes @@ -617,6 +642,15 @@ class TempErrors: "issue tracker: http://github.com/explosion/spaCy/issues") +# Deprecated model shortcuts, only used in errors and warnings +OLD_MODEL_SHORTCUTS = { + "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", + "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm", + "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm", + "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm" +} + + # fmt: on diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index c8b5fc44d..142c6b3a7 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -1,11 +1,8 @@ -from .corpus import Corpus -from .example import Example -from .align import Alignment - -from .iob_utils import iob_to_biluo, biluo_to_iob -from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags -from .iob_utils import spans_from_biluo_tags -from .iob_utils import tags_to_entities - -from .gold_io import docs_to_json -from .gold_io import read_json_file +from .corpus import Corpus # noqa: F401 +from .example import Example # noqa: F401 +from .align import Alignment # noqa: F401 +from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 +from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401 +from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401 +from .gold_io import docs_to_json, read_json_file # noqa: F401 +from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 diff --git a/spacy/gold/align.py 
b/spacy/gold/align.py index af70ee5b7..e8f17a667 100644 --- a/spacy/gold/align.py +++ b/spacy/gold/align.py @@ -4,6 +4,8 @@ from thinc.types import Ragged from dataclasses import dataclass import tokenizations +from ..errors import Errors + @dataclass class Alignment: @@ -18,6 +20,8 @@ class Alignment: @classmethod def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": + if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): + raise ValueError(Errors.E949) x2y, y2x = tokenizations.get_alignments(A, B) return Alignment.from_indices(x2y=x2y, y2x=y2x) diff --git a/spacy/gold/batchers.py b/spacy/gold/batchers.py new file mode 100644 index 000000000..57c6b4b3a --- /dev/null +++ b/spacy/gold/batchers.py @@ -0,0 +1,171 @@ +from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable +from typing import Optional, Any +from functools import partial +import itertools + +from ..util import registry, minibatch + + +Sizing = Union[Iterable[int], int] +ItemT = TypeVar("ItemT") +BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] + + +@registry.batchers("batch_by_padded.v1") +def configure_minibatch_by_padded_size( + *, + size: Sizing, + buffer: int, + discard_oversize: bool, + get_length: Optional[Callable[[ItemT], int]] = None +) -> BatcherT: + # Avoid displacing optional values from the underlying function. 
+ optionals = {"get_length": get_length} if get_length is not None else {} + return partial( + minibatch_by_padded_size, + size=size, + buffer=buffer, + discard_oversize=discard_oversize, + **optionals + ) + + +@registry.batchers("batch_by_words.v1") +def configure_minibatch_by_words( + *, + size: Sizing, + tolerance: float, + discard_oversize: bool, + get_length: Optional[Callable[[ItemT], int]] = None +) -> BatcherT: + optionals = {"get_length": get_length} if get_length is not None else {} + return partial( + minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals + ) + + +@registry.batchers("batch_by_sequence.v1") +def configure_minibatch( + size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None +) -> BatcherT: + optionals = {"get_length": get_length} if get_length is not None else {} + return partial(minibatch, size=size, **optionals) + + +def minibatch_by_padded_size( + docs: Iterator["Doc"], + size: Sizing, + buffer: int = 256, + discard_oversize: bool = False, + get_length: Callable = len, +) -> Iterator[Iterator["Doc"]]: + if isinstance(size, int): + size_ = itertools.repeat(size) + else: + size_ = size + for outer_batch in minibatch(docs, size=buffer): + outer_batch = list(outer_batch) + target_size = next(size_) + for indices in _batch_by_length(outer_batch, target_size, get_length): + subbatch = [outer_batch[i] for i in indices] + padded_size = max(len(seq) for seq in subbatch) * len(subbatch) + if discard_oversize and padded_size >= target_size: + pass + else: + yield subbatch + + +def minibatch_by_words( + docs, size, tolerance=0.2, discard_oversize=False, get_length=len +): + """Create minibatches of roughly a given number of words. If any examples + are longer than the specified batch length, they will appear in a batch by + themselves, or be discarded if discard_oversize=True. + The argument 'docs' can be a list of strings, Docs or Examples. 
+ """ + if isinstance(size, int): + size_ = itertools.repeat(size) + elif isinstance(size, List): + size_ = iter(size) + else: + size_ = size + target_size = next(size_) + tol_size = target_size * tolerance + batch = [] + overflow = [] + batch_size = 0 + overflow_size = 0 + for doc in docs: + n_words = get_length(doc) + # if the current example exceeds the maximum batch size, it is returned separately + # but only if discard_oversize=False. + if n_words > target_size + tol_size: + if not discard_oversize: + yield [doc] + # add the example to the current batch if there's no overflow yet and it still fits + elif overflow_size == 0 and (batch_size + n_words) <= target_size: + batch.append(doc) + batch_size += n_words + # add the example to the overflow buffer if it fits in the tolerance margin + elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): + overflow.append(doc) + overflow_size += n_words + # yield the previous batch and start a new one. The new one gets the overflow examples. + else: + if batch: + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = overflow + batch_size = overflow_size + overflow = [] + overflow_size = 0 + # this example still fits + if (batch_size + n_words) <= target_size: + batch.append(doc) + batch_size += n_words + # this example fits in overflow + elif (batch_size + n_words) <= (target_size + tol_size): + overflow.append(doc) + overflow_size += n_words + # this example does not fit with the previous overflow: start another new batch + else: + if batch: + yield batch + target_size = next(size_) + tol_size = target_size * tolerance + batch = [doc] + batch_size = n_words + batch.extend(overflow) + if batch: + yield batch + + +def _batch_by_length( + seqs: Sequence[Any], max_words: int, get_length=len +) -> List[List[Any]]: + """Given a list of sequences, return a batched list of indices into the + list, where the batches are grouped by length, in descending order. 
+ + Batches may be at most max_words in size, defined as max sequence length * size. + """ + # Use negative index so we can get sort by position ascending. + lengths_indices = [(get_length(seq), i) for i, seq in enumerate(seqs)] + lengths_indices.sort() + batches = [] + batch = [] + for length, i in lengths_indices: + if not batch: + batch.append(i) + elif length * (len(batch) + 1) <= max_words: + batch.append(i) + else: + batches.append(batch) + batch = [i] + if batch: + batches.append(batch) + # Check lengths match + assert sum(len(b) for b in batches) == len(seqs) + batches = [list(sorted(batch)) for batch in batches] + batches.reverse() + return batches diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index 63d52ad9d..15f025a08 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,4 +1,4 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .json2docs import json2docs +from .json2docs import json2docs # noqa: F401 from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d23f70bee..4a65d8885 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,6 +1,5 @@ -from typing import Union, List, Iterable, Iterator, TYPE_CHECKING +from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable from pathlib import Path -import random from .. import util from .example import Example @@ -12,26 +11,43 @@ if TYPE_CHECKING: from ..language import Language # noqa: F401 +@util.registry.readers("spacy.Corpus.v1") +def create_docbin_reader( + path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0 +) -> Callable[["Language"], Iterable[Example]]: + return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit) + + class Corpus: - """An annotated corpus, reading train and dev datasets from - the DocBin (.spacy) format. 
+ """Iterate Example objects from a file or directory of DocBin (.spacy) + formated data files. + + path (Path): The directory or filename to read from. + gold_preproc (bool): Whether to set up the Example object with gold-standard + sentences and tokens for the predictions. Gold preprocessing helps + the annotations align to the tokenization, and may result in sequences + of more consistent length. However, it may reduce run-time accuracy due + to train/test skew. Defaults to False. + max_length (int): Maximum document length. Longer documents will be + split into sentences, if sentence boundaries are available. Defaults to + 0, which indicates no limit. + limit (int): Limit corpus to a subset of examples, e.g. for debugging. + Defaults to 0, which indicates no limit. DOCS: https://spacy.io/api/corpus """ def __init__( - self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0 + self, + path, + *, + limit: int = 0, + gold_preproc: bool = False, + max_length: bool = False, ) -> None: - """Create a Corpus. - - train (str / Path): File or directory of training data. - dev (str / Path): File or directory of development data. - limit (int): Max. number of examples returned. - - DOCS: https://spacy.io/api/corpus#init - """ - self.train_loc = train_loc - self.dev_loc = dev_loc + self.path = util.ensure_path(path) + self.gold_preproc = gold_preproc + self.max_length = max_length self.limit = limit @staticmethod @@ -54,6 +70,21 @@ class Corpus: locs.append(path) return locs + def __call__(self, nlp: "Language") -> Iterator[Example]: + """Yield examples from the data. + + nlp (Language): The current nlp object. + YIELDS (Example): The examples. 
+ + DOCS: https://spacy.io/api/corpus#call + """ + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.path)) + if self.gold_preproc: + examples = self.make_examples_gold_preproc(nlp, ref_docs) + else: + examples = self.make_examples(nlp, ref_docs, self.max_length) + yield from examples + def _make_example( self, nlp: "Language", reference: Doc, gold_preproc: bool ) -> Example: @@ -114,68 +145,3 @@ class Corpus: i += 1 if self.limit >= 1 and i >= self.limit: break - - def count_train(self, nlp: "Language") -> int: - """Returns count of words in train examples. - - nlp (Language): The current nlp. object. - RETURNS (int): The word count. - - DOCS: https://spacy.io/api/corpus#count_train - """ - n = 0 - i = 0 - for example in self.train_dataset(nlp): - n += len(example.predicted) - if self.limit >= 0 and i >= self.limit: - break - i += 1 - return n - - def train_dataset( - self, - nlp: "Language", - *, - shuffle: bool = True, - gold_preproc: bool = False, - max_length: int = 0 - ) -> Iterator[Example]: - """Yield examples from the training data. - - nlp (Language): The current nlp object. - shuffle (bool): Whether to shuffle the examples. - gold_preproc (bool): Whether to train on gold-standard sentences and tokens. - max_length (int): Maximum document length. Longer documents will be - split into sentences, if sentence boundaries are available. 0 for - no limit. - YIELDS (Example): The examples. - - DOCS: https://spacy.io/api/corpus#train_dataset - """ - ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) - if gold_preproc: - examples = self.make_examples_gold_preproc(nlp, ref_docs) - else: - examples = self.make_examples(nlp, ref_docs, max_length) - if shuffle: - examples = list(examples) - random.shuffle(examples) - yield from examples - - def dev_dataset( - self, nlp: "Language", *, gold_preproc: bool = False - ) -> Iterator[Example]: - """Yield examples from the development data. - - nlp (Language): The current nlp object. 
- gold_preproc (bool): Whether to train on gold-standard sentences and tokens. - YIELDS (Example): The examples. - - DOCS: https://spacy.io/api/corpus#dev_dataset - """ - ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) - if gold_preproc: - examples = self.make_examples_gold_preproc(nlp, ref_docs) - else: - examples = self.make_examples(nlp, ref_docs, max_length=0) - yield from examples diff --git a/spacy/gold/example.pxd b/spacy/gold/example.pxd index 1f63b12d0..e06e36287 100644 --- a/spacy/gold/example.pxd +++ b/spacy/gold/example.pxd @@ -4,4 +4,6 @@ from ..tokens.doc cimport Doc cdef class Example: cdef readonly Doc x cdef readonly Doc y - cdef readonly object _alignment + cdef readonly object _cached_alignment + cdef readonly object _cached_words_x + cdef readonly object _cached_words_y diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 9101cefce..f90d98603 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -10,7 +10,7 @@ from .align import Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags from ..errors import Errors, Warnings -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): @@ -32,9 +32,9 @@ cdef class Example: raise TypeError(Errors.E972.format(arg="predicted")) if reference is None: raise TypeError(Errors.E972.format(arg="reference")) - self.x = predicted - self.y = reference - self._alignment = alignment + self.predicted = predicted + self.reference = reference + self._cached_alignment = alignment def __len__(self): return len(self.predicted) @@ -45,7 +45,8 @@ cdef class Example: def __set__(self, doc): self.x = doc - self._alignment = None + self._cached_alignment = None + self._cached_words_x = [t.text for t in doc] property reference: def __get__(self): @@ -53,7 +54,8 @@ cdef class Example: def __set__(self, doc): self.y = doc - 
self._alignment = None + self._cached_alignment = None + self._cached_words_y = [t.text for t in doc] def copy(self): return Example( @@ -79,13 +81,15 @@ cdef class Example: @property def alignment(self): - if self._alignment is None: - spacy_words = [token.orth_ for token in self.predicted] - gold_words = [token.orth_ for token in self.reference] - if gold_words == []: - gold_words = spacy_words - self._alignment = Alignment.from_strings(spacy_words, gold_words) - return self._alignment + words_x = [token.text for token in self.x] + words_y = [token.text for token in self.y] + if self._cached_alignment is None or \ + words_x != self._cached_words_x or \ + words_y != self._cached_words_y: + self._cached_alignment = Alignment.from_strings(words_x, words_y) + self._cached_words_x = words_x + self._cached_words_y = words_y + return self._cached_alignment def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" @@ -179,15 +183,15 @@ cdef class Example: "links": self._links_to_dict() }, "token_annotation": { - "ids": [t.i+1 for t in self.reference], - "words": [t.text for t in self.reference], - "tags": [t.tag_ for t in self.reference], - "lemmas": [t.lemma_ for t in self.reference], - "pos": [t.pos_ for t in self.reference], - "morphs": [t.morph_ for t in self.reference], - "heads": [t.head.i for t in self.reference], - "deps": [t.dep_ for t in self.reference], - "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference] + "ORTH": [t.text for t in self.reference], + "SPACY": [bool(t.whitespace_) for t in self.reference], + "TAG": [t.tag_ for t in self.reference], + "LEMMA": [t.lemma_ for t in self.reference], + "POS": [t.pos_ for t in self.reference], + "MORPH": [t.morph_ for t in self.reference], + "HEAD": [t.head.i for t in self.reference], + "DEP": [t.dep_ for t in self.reference], + "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference] } } @@ -331,10 +335,14 @@ def _fix_legacy_dict_data(example_dict): 
for key, value in old_token_dict.items(): if key in ("text", "ids", "brackets"): pass + elif key in remapping.values(): + token_dict[key] = value elif key.lower() in remapping: token_dict[remapping[key.lower()]] = value else: - raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys())) + all_keys = set(remapping.values()) + all_keys.update(remapping.keys()) + raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=all_keys)) text = example_dict.get("text", example_dict.get("raw")) if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"): token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"]) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 3f226596c..9035f7e6a 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -71,17 +71,25 @@ cdef class KnowledgeBase: DOCS: https://spacy.io/api/kb """ - def __init__(self, Vocab vocab, entity_vector_length=64): - self.vocab = vocab + def __init__(self, entity_vector_length): + """Create a KnowledgeBase. 
Make sure to call kb.initialize() before using it.""" self.mem = Pool() self.entity_vector_length = entity_vector_length self._entry_index = PreshMap() self._alias_index = PreshMap() + self.vocab = None + + def initialize(self, Vocab vocab): + self.vocab = vocab self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + def require_vocab(self): + if self.vocab is None: + raise ValueError(Errors.E946) + @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -94,12 +102,14 @@ cdef class KnowledgeBase: return len(self._entry_index) def get_entity_strings(self): + self.require_vocab() return [self.vocab.strings[x] for x in self._entry_index] def get_size_aliases(self): return len(self._alias_index) def get_alias_strings(self): + self.require_vocab() return [self.vocab.strings[x] for x in self._alias_index] def add_entity(self, unicode entity, float freq, vector[float] entity_vector): @@ -107,6 +117,7 @@ cdef class KnowledgeBase: Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
""" + self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before @@ -129,6 +140,7 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, freq_list, vector_list): + self.require_vocab() if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) @@ -164,10 +176,12 @@ cdef class KnowledgeBase: i += 1 def contains_entity(self, unicode entity): + self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) return entity_hash in self._entry_index def contains_alias(self, unicode alias): + self.require_vocab() cdef hash_t alias_hash = self.vocab.strings.add(alias) return alias_hash in self._alias_index @@ -176,6 +190,7 @@ cdef class KnowledgeBase: For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ + self.require_vocab() # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, @@ -219,6 +234,7 @@ cdef class KnowledgeBase: Throw an error if this entity+prior prob would exceed the sum of 1. For efficiency, it's best to use the method `add_alias` as much as possible instead of this one. """ + self.require_vocab() # Check if the alias exists in the KB cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: @@ -265,6 +281,7 @@ cdef class KnowledgeBase: and the prior probability of that alias resolving to that entity. If the alias is not known in the KB, and empty list is returned. 
""" + self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: return [] @@ -281,6 +298,7 @@ cdef class KnowledgeBase: if entry_index != 0] def get_vector(self, unicode entity): + self.require_vocab() cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB @@ -293,6 +311,7 @@ cdef class KnowledgeBase: def get_prior_prob(self, unicode entity, unicode alias): """ Return the prior probability of a given alias being linked to a given entity, or return 0.0 when this combination is not known in the knowledge base""" + self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t entity_hash = self.vocab.strings[entity] @@ -311,6 +330,7 @@ cdef class KnowledgeBase: def dump(self, loc): + self.require_vocab() cdef Writer writer = Writer(loc) writer.write_header(self.get_size_entities(), self.entity_vector_length) diff --git a/spacy/language.py b/spacy/language.py index e415869b3..e9d7e9eb6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,7 +18,7 @@ from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab -from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .gold import Example from .scorer import Scorer from .util import create_default_optimizer, registry @@ -37,8 +37,6 @@ from . import util from . import about -# TODO: integrate pipeline analyis -ENABLE_PIPELINE_ANALYSIS = False # This is the base config will all settings (training etc.) 
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH) @@ -522,6 +520,25 @@ class Language: return add_component(func) return add_component + def analyze_pipes( + self, + *, + keys: List[str] = ["assigns", "requires", "scores", "retokenizes"], + pretty: bool = False, + ) -> Optional[Dict[str, Any]]: + """Analyze the current pipeline components, print a summary of what + they assign or require and check that all requirements are met. + + keys (List[str]): The meta values to display in the table. Corresponds + to values in FactoryMeta, defined by @Language.factory decorator. + pretty (bool): Pretty-print the results. + RETURNS (dict): The data. + """ + analysis = analyze_pipes(self, keys=keys) + if pretty: + print_pipe_analysis(analysis, keys=keys) + return analysis + def get_pipe(self, name: str) -> Callable[[Doc], Doc]: """Get a pipeline component for a given component name. @@ -541,7 +558,6 @@ class Language: name: Optional[str] = None, *, config: Optional[Dict[str, Any]] = SimpleFrozenDict(), - overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(), validate: bool = True, ) -> Callable[[Doc], Doc]: """Create a pipeline component. Mostly used internally. To create and @@ -552,8 +568,6 @@ class Language: Defaults to factory name if not set. config (Optional[Dict[str, Any]]): Config parameters to use for this component. Will be merged with default config, if available. - overrides (Optional[Dict[str, Any]]): Config overrides, typically - passed in via the CLI. validate (bool): Whether to validate the component config against the arguments and types expected by the factory. RETURNS (Callable[[Doc], Doc]): The pipeline component. 
@@ -596,13 +610,39 @@ class Language: # registered functions twice # TODO: customize validation to make it more readable / relate it to # pipeline component and why it failed, explain default config - resolved, filled = registry.resolve(cfg, validate=validate, overrides=overrides) + resolved, filled = registry.resolve(cfg, validate=validate) filled = filled[factory_name] filled["factory"] = factory_name filled.pop("@factories", None) self._pipe_configs[name] = filled return resolved[factory_name] + def create_pipe_from_source( + self, source_name: str, source: "Language", *, name: str, + ) -> Tuple[Callable[[Doc], Doc], str]: + """Create a pipeline component by copying it from an existing model. + + source_name (str): Name of the component in the source pipeline. + source (Language): The source nlp object to copy from. + name (str): Optional alternative name to use in current pipeline. + RETURNS (Tuple[Callable, str]): The component and its factory name. + """ + # TODO: handle errors and mismatches (vectors etc.) + if not isinstance(source, self.__class__): + raise ValueError(Errors.E945.format(name=source_name, source=type(source))) + if not source.has_pipe(source_name): + raise KeyError( + Errors.E944.format( + name=source_name, + model=f"{source.meta['lang']}_{source.meta['name']}", + opts=", ".join(source.pipe_names), + ) + ) + pipe = source.get_pipe(source_name) + pipe_config = util.copy_config(source.config["components"][source_name]) + self._pipe_configs[name] = pipe_config + return pipe, pipe_config["factory"] + def add_pipe( self, factory_name: str, @@ -612,8 +652,8 @@ class Language: after: Optional[Union[str, int]] = None, first: Optional[bool] = None, last: Optional[bool] = None, + source: Optional["Language"] = None, config: Optional[Dict[str, Any]] = SimpleFrozenDict(), - overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(), validate: bool = True, ) -> Callable[[Doc], Doc]: """Add a component to the processing pipeline. 
Valid components are @@ -631,10 +671,10 @@ class Language: component directly after. first (bool): If True, insert component first in the pipeline. last (bool): If True, insert component last in the pipeline. + source (Language): Optional loaded nlp object to copy the pipeline + component from. config (Optional[Dict[str, Any]]): Config parameters to use for this component. Will be merged with default config, if available. - overrides (Optional[Dict[str, Any]]): Config overrides, typically - passed in via the CLI. validate (bool): Whether to validate the component config against the arguments and types expected by the factory. RETURNS (Callable[[Doc], Doc]): The pipeline component. @@ -645,29 +685,30 @@ class Language: bad_val = repr(factory_name) err = Errors.E966.format(component=bad_val, name=name) raise ValueError(err) - if not self.has_factory(factory_name): - err = Errors.E002.format( - name=factory_name, - opts=", ".join(self.factory_names), - method="add_pipe", - lang=util.get_object_name(self), - lang_code=self.lang, - ) name = name if name is not None else factory_name if name in self.pipe_names: raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) - pipe_component = self.create_pipe( - factory_name, - name=name, - config=config, - overrides=overrides, - validate=validate, - ) + if source is not None: + # We're loading the component from a model. 
After loading the + # component, we know its real factory name + pipe_component, factory_name = self.create_pipe_from_source( + factory_name, source, name=name + ) + else: + if not self.has_factory(factory_name): + err = Errors.E002.format( + name=factory_name, + opts=", ".join(self.factory_names), + method="add_pipe", + lang=util.get_object_name(self), + lang_code=self.lang, + ) + pipe_component = self.create_pipe( + factory_name, name=name, config=config, validate=validate, + ) pipe_index = self._get_pipe_index(before, after, first, last) self._pipe_meta[name] = self.get_factory_meta(factory_name) self.pipeline.insert(pipe_index, (name, pipe_component)) - if ENABLE_PIPELINE_ANALYSIS: - analyze_pipes(self, name, pipe_index) return pipe_component def _get_pipe_index( @@ -754,12 +795,11 @@ class Language: # to Language.pipeline to make sure the configs are handled correctly pipe_index = self.pipe_names.index(name) self.remove_pipe(name) - if not len(self.pipeline): # we have no components to insert before/after + if not len(self.pipeline) or pipe_index == len(self.pipeline): + # we have no components to insert before/after, or we're replacing the last component self.add_pipe(factory_name, name=name) else: self.add_pipe(factory_name, name=name, before=pipe_index) - if ENABLE_PIPELINE_ANALYSIS: - analyze_all_pipes(self) def rename_pipe(self, old_name: str, new_name: str) -> None: """Rename a pipeline component. @@ -793,8 +833,6 @@ class Language: # because factory may be used for something else self._pipe_meta.pop(name) self._pipe_configs.pop(name) - if ENABLE_PIPELINE_ANALYSIS: - analyze_all_pipes(self) return removed def __call__( @@ -900,6 +938,7 @@ class Language: sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, + exclude: Iterable[str] = tuple(), ): """Update the models in the pipeline. 
@@ -910,6 +949,7 @@ class Language: losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. + exclude (Iterable[str]): Names of components that shouldn't be updated. RETURNS (Dict[str, float]): The updated losses dictionary DOCS: https://spacy.io/api/language#update @@ -942,12 +982,12 @@ class Language: component_cfg[name].setdefault("drop", drop) component_cfg[name].setdefault("set_annotations", False) for name, proc in self.pipeline: - if not hasattr(proc, "update"): + if name in exclude or not hasattr(proc, "update"): continue proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) if sgd not in (None, False): for name, proc in self.pipeline: - if hasattr(proc, "model"): + if name not in exclude and hasattr(proc, "model"): proc.model.finish_update(sgd) return losses @@ -958,6 +998,7 @@ class Language: sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, + exclude: Iterable[str] = tuple(), ) -> Dict[str, float]: """Make a "rehearsal" update to the models in the pipeline, to prevent forgetting. Rehearsal updates run an initial copy of the model over some @@ -969,6 +1010,7 @@ class Language: sgd (Optional[Optimizer]): An optimizer. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. + exclude (Iterable[str]): Names of components that shouldn't be updated. RETURNS (dict): Results from the update. 
EXAMPLE: @@ -1012,7 +1054,7 @@ class Language: get_grads.b1 = sgd.b1 get_grads.b2 = sgd.b2 for name, proc in pipes: - if not hasattr(proc, "rehearse"): + if name in exclude or not hasattr(proc, "rehearse"): continue grads = {} proc.rehearse( @@ -1063,7 +1105,7 @@ class Language: return self._optimizer def resume_training( - self, *, sgd: Optional[Optimizer] = None, device: int = -1 + self, *, sgd: Optional[Optimizer] = None, device: int = -1, ) -> Optimizer: """Continue training a pretrained model. @@ -1099,6 +1141,7 @@ class Language: batch_size: int = 256, scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, + scorer_cfg: Optional[Dict[str, Any]] = None, ) -> Dict[str, Union[float, dict]]: """Evaluate a model's pipeline components. @@ -1109,6 +1152,8 @@ class Language: will be created. component_cfg (dict): An optional dictionary with extra keyword arguments for specific components. + scorer_cfg (dict): An optional dictionary with extra keyword arguments + for the scorer. RETURNS (Scorer): The scorer containing the evaluation results. 
DOCS: https://spacy.io/api/language#evaluate @@ -1126,8 +1171,10 @@ class Language: raise TypeError(err) if component_cfg is None: component_cfg = {} + if scorer_cfg is None: + scorer_cfg = {} if scorer is None: - kwargs = component_cfg.get("scorer", {}) + kwargs = dict(scorer_cfg) kwargs.setdefault("verbose", verbose) kwargs.setdefault("nlp", self) scorer = Scorer(**kwargs) @@ -1136,9 +1183,9 @@ class Language: start_time = timer() # tokenize the texts only for timing purposes if not hasattr(self.tokenizer, "pipe"): - _ = [self.tokenizer(text) for text in texts] + _ = [self.tokenizer(text) for text in texts] # noqa: F841 else: - _ = list(self.tokenizer.pipe(texts)) + _ = list(self.tokenizer.pipe(texts)) # noqa: F841 for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) @@ -1357,8 +1404,8 @@ class Language: cls, config: Union[Dict[str, Any], Config] = {}, *, + vocab: Union[Vocab, bool] = True, disable: Iterable[str] = tuple(), - overrides: Dict[str, Any] = {}, auto_fill: bool = True, validate: bool = True, ) -> "Language": @@ -1367,6 +1414,7 @@ class Language: the default config of the given language is used. config (Dict[str, Any] / Config): The loaded config. + vocab (Vocab): A Vocab object. If True, a vocab is created. disable (Iterable[str]): List of pipeline component names to disable. auto_fill (bool): Automatically fill in missing values in config based on defaults and function argument annotations. 
@@ -1397,43 +1445,76 @@ class Language: config = util.copy_config(config) orig_pipeline = config.pop("components", {}) config["components"] = {} - non_pipe_overrides, pipe_overrides = _get_config_overrides(overrides) resolved, filled = registry.resolve( - config, validate=validate, schema=ConfigSchema, overrides=non_pipe_overrides + config, validate=validate, schema=ConfigSchema ) filled["components"] = orig_pipeline config["components"] = orig_pipeline create_tokenizer = resolved["nlp"]["tokenizer"] create_lemmatizer = resolved["nlp"]["lemmatizer"] - nlp = cls( - create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer, + before_creation = resolved["nlp"]["before_creation"] + after_creation = resolved["nlp"]["after_creation"] + after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"] + lang_cls = cls + if before_creation is not None: + lang_cls = before_creation(cls) + if ( + not isinstance(lang_cls, type) + or not issubclass(lang_cls, cls) + or lang_cls is not cls + ): + raise ValueError(Errors.E943.format(value=type(lang_cls))) + nlp = lang_cls( + vocab=vocab, + create_tokenizer=create_tokenizer, + create_lemmatizer=create_lemmatizer, ) + if after_creation is not None: + nlp = after_creation(nlp) + if not isinstance(nlp, cls): + raise ValueError(Errors.E942.format(name="creation", value=type(nlp))) # Note that we don't load vectors here, instead they get loaded explicitly # inside stuff like the spacy train function. If we loaded them here, # then we would load them twice at runtime: once when we make from config, # and then again when we load from disk. 
pipeline = config.get("components", {}) + # If components are loaded from a source (existing models), we cache + # them here so they're only loaded once + source_nlps = {} for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) pipe_cfg = util.copy_config(pipeline[pipe_name]) if pipe_name not in disable: - if "factory" not in pipe_cfg: + if "factory" not in pipe_cfg and "source" not in pipe_cfg: err = Errors.E984.format(name=pipe_name, config=pipe_cfg) raise ValueError(err) - factory = pipe_cfg.pop("factory") - # The pipe name (key in the config) here is the unique name of the - # component, not necessarily the factory - nlp.add_pipe( - factory, - name=pipe_name, - config=pipe_cfg, - overrides=pipe_overrides, - validate=validate, - ) + if "factory" in pipe_cfg: + factory = pipe_cfg.pop("factory") + # The pipe name (key in the config) here is the unique name + # of the component, not necessarily the factory + nlp.add_pipe( + factory, name=pipe_name, config=pipe_cfg, validate=validate, + ) + else: + model = pipe_cfg["source"] + if model not in source_nlps: + # We only need the components here and we need to init + # model with the same vocab as the current nlp object + source_nlps[model] = util.load_model( + model, vocab=nlp.vocab, disable=["vocab", "tokenizer"] + ) + source_name = pipe_cfg.get("component", pipe_name) + nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) nlp.config = filled if auto_fill else config nlp.resolved = resolved + if after_pipeline_creation is not None: + nlp = after_pipeline_creation(nlp) + if not isinstance(nlp, cls): + raise ValueError( + Errors.E942.format(name="pipeline_creation", value=type(nlp)) + ) return nlp def to_disk( @@ -1599,15 +1680,6 @@ class FactoryMeta: default_score_weights: Optional[Dict[str, float]] = None # noqa: E704 -def _get_config_overrides( - items: Dict[str, Any], prefix: str = 
"components" -) -> Tuple[Dict[str, Any], Dict[str, Any]]: - prefix = f"{prefix}." - non_pipe = {k: v for k, v in items.items() if not k.startswith(prefix)} - pipe = {k.replace(prefix, ""): v for k, v in items.items() if k.startswith(prefix)} - return non_pipe, pipe - - def _fix_pretrained_vectors_name(nlp: Language) -> None: # TODO: Replace this once we handle vectors consistently as static # data diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py index 5a8f28dfe..5a66a35bd 100644 --- a/spacy/ml/_biluo.py +++ b/spacy/ml/_biluo.py @@ -80,7 +80,7 @@ def _get_transition_table( B_start, B_end = (0, n_labels) I_start, I_end = (B_end, B_end + n_labels) L_start, L_end = (I_end, I_end + n_labels) - U_start, _ = (L_end, L_end + n_labels) + U_start, _ = (L_end, L_end + n_labels) # noqa: F841 # Using ranges allows us to set specific cells, which is necessary to express # that only actions of the same label are valid continuations. B_range = numpy.arange(B_start, B_end) diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index ab0cb85c7..f5c539c42 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -1,6 +1,7 @@ from typing import List from thinc.api import Model from thinc.types import Floats2d + from ..tokens import Doc @@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: ) -def init(model, X=None, Y=None): +def init(model: Model, X=None, Y=None): vectors_table = model.ops.alloc3f( model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM") ) model.set_param("E", vectors_table) -def forward(model, docs, is_train): +def forward(model: Model, docs: List[Doc], is_train: bool): if docs is None: return [] ids = [] diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 9f385ec0d..4dbc79f52 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]: ) -def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): +def init(model: 
Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None: if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index f9f691aae..bdc297232 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -4,14 +4,14 @@ from thinc.api import Model from ..attrs import LOWER -def extract_ngrams(ngram_size, attr=LOWER) -> Model: +def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: model = Model("extract_ngrams", forward) model.attrs["ngram_size"] = ngram_size model.attrs["attr"] = attr return model -def forward(model, docs, is_train: bool): +def forward(model: Model, docs, is_train: bool): batch_keys = [] batch_vals = [] for doc in docs: diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index ffd6c3c1c..f96d50a7b 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,5 +1,4 @@ -from pathlib import Path - +from typing import Optional from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear @@ -9,7 +8,7 @@ from ...vocab import Vocab @registry.architectures.register("spacy.EntityLinker.v1") -def build_nel_encoder(tok2vec, nO=None): +def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: with Model.define_operators({">>": chain, "**": clone}): token_width = tok2vec.get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) @@ -26,8 +25,15 @@ def build_nel_encoder(tok2vec, nO=None): @registry.assets.register("spacy.KBFromFile.v1") -def load_kb(vocab_path, kb_path) -> KnowledgeBase: +def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase: vocab = Vocab().from_disk(vocab_path) - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(entity_vector_length=1) + kb.initialize(vocab) kb.load_bulk(kb_path) return kb + + +@registry.assets.register("spacy.EmptyKB.v1") +def 
empty_kb(entity_vector_length: int) -> KnowledgeBase: + kb = KnowledgeBase(entity_vector_length=entity_vector_length) + return kb diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index ed85b1a91..ac990c015 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,10 +1,20 @@ +from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING import numpy - from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from ...vocab import Vocab # noqa: F401 + from ...tokens import Doc # noqa: F401 -def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): + +def build_multi_task_model( + tok2vec: Model, + maxout_pieces: int, + token_vector_width: int, + nO: Optional[int] = None, +) -> Model: softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, @@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): return model -def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None): +def build_cloze_multi_task_model( + vocab: "Vocab", + tok2vec: Model, + maxout_pieces: int, + hidden_size: int, + nO: Optional[int] = None, +) -> Model: # nO = vocab.vectors.data.shape[1] output_layer = chain( list2array(), @@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO= def build_cloze_characters_multi_task_model( - vocab, tok2vec, maxout_pieces, hidden_size, nr_char -): + vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int +) -> Model: output_layer = chain( list2array(), Maxout(hidden_size, nP=maxout_pieces), LayerNorm(nI=hidden_size), MultiSoftmax([256] * nr_char, nI=hidden_size), ) - model = build_masked_language_model(vocab, chain(tok2vec, output_layer)) model.set_ref("tok2vec", 
tok2vec) model.set_ref("output_layer", output_layer) return model -def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): +def build_masked_language_model( + vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15 +) -> Model: """Convert a model into a BERT-style masked language model""" - random_words = _RandomWords(vocab) def mlm_forward(model, docs, is_train): @@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): return output, mlm_backward - def mlm_initialize(model, X=None, Y=None): + def mlm_initialize(model: Model, X=None, Y=None): wrapped = model.layers[0] wrapped.initialize(X=X, Y=Y) for dim in wrapped.dim_names: @@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): dims={dim: None for dim in wrapped_model.dim_names}, ) mlm_model.set_ref("wrapped", wrapped_model) - return mlm_model class _RandomWords: - def __init__(self, vocab): + def __init__(self, vocab: "Vocab") -> None: self.words = [lex.text for lex in vocab if lex.prob != 0.0] self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.words = self.words[:10000] @@ -104,7 +119,7 @@ class _RandomWords: self.probs /= self.probs.sum() self._cache = [] - def next(self): + def next(self) -> str: if not self._cache: self._cache.extend( numpy.random.choice(len(self.words), 10000, p=self.probs) @@ -113,9 +128,11 @@ class _RandomWords: return self.words[index] -def _apply_mask(docs, random_words, mask_prob=0.15): +def _apply_mask( + docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15 +) -> Tuple[numpy.ndarray, List["Doc"]]: # This needs to be here to avoid circular imports - from ...tokens import Doc + from ...tokens import Doc # noqa: F811 N = sum(len(doc) for doc in docs) mask = numpy.random.uniform(0.0, 1.0, (N,)) @@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15): return mask, masked_docs -def _replace_word(word, random_words, mask="[MASK]"): +def 
_replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str: roll = numpy.random.random() if roll < 0.8: return mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index c1e530d4a..429ceff28 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,6 +1,5 @@ -from pydantic import StrictInt -from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array -from thinc.api import LayerNorm, Maxout, Mish +from typing import Optional +from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from ...util import registry from .._precomputable_affine import PrecomputableAffine @@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model, - nr_feature_tokens: StrictInt, - hidden_width: StrictInt, - maxout_pieces: StrictInt, - use_upper=True, - nO=None, -): + nr_feature_tokens: int, + hidden_width: int, + maxout_pieces: int, + use_upper: bool = True, + nO: Optional[int] = None, +) -> Model: t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) tok2vec.set_dim("nO", hidden_width) - lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 1fb5a71c0..b2934dadc 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -26,7 +26,6 @@ def BiluoTagger( with_array(softmax_activation()), padded2list(), ) - return Model( "biluo-tagger", forward, @@ -52,7 +51,6 @@ def IOBTagger( with_array(softmax_activation()), padded2list(), ) - return Model( "iob-tagger", forward, diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 7fe417321..78637e8b5 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,10 +1,11 @@ +from 
typing import Optional from thinc.api import zero_init, with_array, Softmax, chain, Model from ...util import registry @registry.architectures.register("spacy.Tagger.v1") -def build_tagger_model(tok2vec, nO=None) -> Model: +def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model: # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None output_layer = Softmax(nO, t2v_width, init_W=zero_init) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 53200c165..0a25699dc 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -2,10 +2,9 @@ from typing import Optional from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued +from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import Relu, residual, expand_window, FeatureExtractor -from ... 
import util from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier( @registry.architectures.register("spacy.TextCatBOW.v1") -def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): +def build_bow_text_classifier( + exclusive_classes: bool, + ngram_size: int, + no_output_layer: bool, + nO: Optional[int] = None, +) -> Model: with Model.define_operators({">>": chain}): sparse_linear = SparseLinear(nO) model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear @@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCatEnsemble.v1") def build_text_classifier( - width, - embed_size, - pretrained_vectors, - exclusive_classes, - ngram_size, - window_size, - conv_depth, - dropout, - nO=None, -): + width: int, + embed_size: int, + pretrained_vectors: Optional[bool], + exclusive_classes: bool, + ngram_size: int, + window_size: int, + conv_depth: int, + dropout: Optional[float], + nO: Optional[int] = None, +) -> Model: cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): lower = HashEmbed( @@ -91,7 +95,6 @@ def build_text_classifier( dropout=dropout, seed=13, ) - width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( uniqued( @@ -100,7 +103,6 @@ def build_text_classifier( column=cols.index(ORTH), ) ) - if pretrained_vectors: static_vectors = StaticVectors(width) vector_layer = trained_vectors | static_vectors @@ -152,7 +154,12 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): +def build_text_classifier_lowdata( + width: int, + pretrained_vectors: 
Optional[bool], + dropout: Optional[float], + nO: Optional[int] = None, +) -> Model: # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): model = ( diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1460b3005..474942558 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from thinc.types import Floats2d from ...tokens import Doc -from ... import util from ...util import registry from ...ml import _character_embed from ..staticvectors import StaticVectors from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE +from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE @registry.architectures.register("spacy.Tok2VecListener.v1") -def tok2vec_listener_v1(width, upstream="*"): +def tok2vec_listener_v1(width: int, upstream: str = "*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec @@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec( width=width, depth=depth, window_size=window_size, - maxout_pieces=maxout_pieces - ) + maxout_pieces=maxout_pieces, + ), ) + @registry.architectures.register("spacy.Tok2Vec.v1") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], @@ -68,7 +68,6 @@ def MultiHashEmbed( width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool ): cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] - seed = 7 def make_hash_embed(feature): @@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): chain( FeatureExtractor([NORM]), list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) - ) + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), ), with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), - ragged2list() + ragged2list(), ) return model @@ -155,12 
+154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: def MishWindowEncoder(width, window_size, depth): cnn = chain( expand_window(window_size=window_size), - Mish( - nO=width, - nI=width * ((window_size * 2) + 1), - dropout=0.0, - normalize=True - ), + Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) diff --git a/spacy/syntax/_parser_model.pxd b/spacy/ml/parser_model.pxd similarity index 88% rename from spacy/syntax/_parser_model.pxd rename to spacy/ml/parser_model.pxd index 15befb372..6582b3468 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/ml/parser_model.pxd @@ -1,8 +1,6 @@ from libc.string cimport memset, memcpy -from libc.stdlib cimport calloc, free, realloc -from ..typedefs cimport weight_t, class_t, hash_t - -from ._state cimport StateC +from ..typedefs cimport weight_t, hash_t +from ..pipeline._parser_internals._state cimport StateC cdef struct SizesC: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/ml/parser_model.pyx similarity index 97% rename from spacy/syntax/_parser_model.pyx rename to spacy/ml/parser_model.pyx index eedd84bac..da937ca4f 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -1,29 +1,18 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel cimport numpy as np from libc.math cimport exp -from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy import numpy import numpy.random -from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop +from thinc.api import Model, CupyOps, NumpyOps -from ..typedefs cimport weight_t, class_t, hash_t -from ..tokens.doc cimport Doc -from .stateclass cimport StateClass -from .transition_system 
cimport Transition - -from ..compat import copy_array -from ..errors import Errors, TempErrors -from ..util import create_default_optimizer from .. import util -from . import nonproj +from ..typedefs cimport weight_t, class_t, hash_t +from ..pipeline._parser_internals.stateclass cimport StateClass cdef WeightsC get_c_weights(model) except *: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 39d4b0a14..44f125a04 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,5 @@ from thinc.api import Model, noop, use_ops, Linear -from ..syntax._parser_model import ParserStepModel +from .parser_model import ParserStepModel def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index b57f1524b..008ac3384 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -1,9 +1,8 @@ from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING -from wasabi import Printer -import warnings +from wasabi import msg from .tokens import Doc, Token, Span -from .errors import Errors, Warnings +from .errors import Errors from .util import dot_to_dict if TYPE_CHECKING: @@ -11,48 +10,7 @@ if TYPE_CHECKING: from .language import Language # noqa: F401 -def analyze_pipes( - nlp: "Language", name: str, index: int, warn: bool = True -) -> List[str]: - """Analyze a pipeline component with respect to its position in the current - pipeline and the other components. Will check whether requirements are - fulfilled (e.g. if previous components assign the attributes). - - nlp (Language): The current nlp object. - name (str): The name of the pipeline component to analyze. - index (int): The index of the component in the pipeline. - warn (bool): Show user warning if problem is found. - RETURNS (List[str]): The problems found for the given pipeline component. 
- """ - assert nlp.pipeline[index][0] == name - prev_pipes = nlp.pipeline[:index] - meta = nlp.get_pipe_meta(name) - requires = {annot: False for annot in meta.requires} - if requires: - for prev_name, prev_pipe in prev_pipes: - prev_meta = nlp.get_pipe_meta(prev_name) - for annot in prev_meta.assigns: - requires[annot] = True - problems = [] - for annot, fulfilled in requires.items(): - if not fulfilled: - problems.append(annot) - if warn: - warnings.warn(Warnings.W025.format(name=name, attr=annot)) - return problems - - -def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]: - """Analyze all pipes in the pipeline in order. - - nlp (Language): The current nlp object. - warn (bool): Show user warning if problem is found. - RETURNS (Dict[str, List[str]]): The problems found, keyed by component name. - """ - problems = {} - for i, name in enumerate(nlp.pipe_names): - problems[name] = analyze_pipes(nlp, name, i, warn=warn) - return problems +DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"] def validate_attrs(values: Iterable[str]) -> Iterable[str]: @@ -101,89 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]: return values -def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]: - assert feature in ["assigns", "requires"] - result = [] +def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]: + """Check which components in the pipeline assign or require an attribute. + + nlp (Language): The current nlp object. + attr (str): The attribute, e.g. "doc.tensor". + RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires", + mapped to a list of component names. 
+ """ + result = {"assigns": [], "requires": []} for pipe_name in nlp.pipe_names: meta = nlp.get_pipe_meta(pipe_name) - pipe_assigns = getattr(meta, feature, []) - if attr in pipe_assigns: - result.append(pipe_name) + if attr in meta.assigns: + result["assigns"].append(pipe_name) + if attr in meta.requires: + result["requires"].append(pipe_name) return result -def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]: - """Get all pipeline components that assign an attr, e.g. "doc.tensor". - - pipeline (Language): The current nlp object. - attr (str): The attribute to check. - RETURNS (List[str]): Names of components that require the attr. - """ - return _get_feature_for_attr(nlp, attr, "assigns") - - -def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]: - """Get all pipeline components that require an attr, e.g. "doc.tensor". - - pipeline (Language): The current nlp object. - attr (str): The attribute to check. - RETURNS (List[str]): Names of components that require the attr. - """ - return _get_feature_for_attr(nlp, attr, "requires") - - -def print_summary( - nlp: "Language", pretty: bool = True, no_print: bool = False -) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: +def analyze_pipes( + nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Print a formatted summary for the current nlp object's pipeline. Shows a table with the pipeline components and why they assign and require, as well as any problems if available. nlp (Language): The nlp object. - pretty (bool): Pretty-print the results (color etc). - no_print (bool): Don't print anything, just return the data. - RETURNS (dict): A dict with "overview" and "problems". + keys (List[str]): The meta keys to show in the table. + RETURNS (dict): A dict with "summary" and "problems". 
""" - msg = Printer(pretty=pretty, no_print=no_print) - overview = [] - problems = {} + result = {"summary": {}, "problems": {}} + all_attrs = set() for i, name in enumerate(nlp.pipe_names): meta = nlp.get_pipe_meta(name) - overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes)) - problems[name] = analyze_pipes(nlp, name, i, warn=False) + all_attrs.update(meta.assigns) + all_attrs.update(meta.requires) + result["summary"][name] = {key: getattr(meta, key, None) for key in keys} + prev_pipes = nlp.pipeline[:i] + requires = {annot: False for annot in meta.requires} + if requires: + for prev_name, prev_pipe in prev_pipes: + prev_meta = nlp.get_pipe_meta(prev_name) + for annot in prev_meta.assigns: + requires[annot] = True + result["problems"][name] = [] + for annot, fulfilled in requires.items(): + if not fulfilled: + result["problems"][name].append(annot) + result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs} + return result + + +def print_pipe_analysis( + analysis: Dict[str, Union[List[str], Dict[str, List[str]]]], + *, + keys: List[str] = DEFAULT_KEYS, +) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: + """Print a formatted version of the pipe analysis produced by analyze_pipes. + + analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis. + keys (List[str]): The meta keys to show in the table. 
+ """ msg.divider("Pipeline Overview") - header = ("#", "Component", "Requires", "Assigns", "Retokenizes") - msg.table(overview, header=header, divider=True, multiline=True) - n_problems = sum(len(p) for p in problems.values()) - if any(p for p in problems.values()): + header = ["#", "Component", *[key.capitalize() for key in keys]] + summary = analysis["summary"].items() + body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)] + msg.table(body, header=header, divider=True, multiline=True) + n_problems = sum(len(p) for p in analysis["problems"].values()) + if any(p for p in analysis["problems"].values()): msg.divider(f"Problems ({n_problems})") - for name, problem in problems.items(): + for name, problem in analysis["problems"].items(): if problem: msg.warn(f"'{name}' requirements not met: {', '.join(problem)}") else: msg.good("No problems found.") - if no_print: - return {"overview": overview, "problems": problems} - - -def count_pipeline_interdependencies(nlp: "Language") -> List[int]: - """Count how many subsequent components require an annotation set by each - component in the pipeline. - - nlp (Language): The current nlp object. - RETURNS (List[int]): The interdependency counts. 
- """ - pipe_assigns = [] - pipe_requires = [] - for name in nlp.pipe_names: - meta = nlp.get_pipe_meta(name) - pipe_assigns.append(set(meta.assigns)) - pipe_requires.append(set(meta.requires)) - counts = [] - for i, assigns in enumerate(pipe_assigns): - count = 0 - for requires in pipe_requires[i + 1 :]: - if assigns.intersection(requires): - count += 1 - counts.append(count) - return counts diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index f8accd14f..7f395b5f2 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,3 +1,4 @@ +from .attributeruler import AttributeRuler from .dep_parser import DependencyParser from .entity_linker import EntityLinker from .ner import EntityRecognizer @@ -13,6 +14,7 @@ from .tok2vec import Tok2Vec from .functions import merge_entities, merge_noun_chunks, merge_subtokens __all__ = [ + "AttributeRuler", "DependencyParser", "EntityLinker", "EntityRecognizer", diff --git a/spacy/syntax/__init__.py b/spacy/pipeline/_parser_internals/__init__.py similarity index 100% rename from spacy/syntax/__init__.py rename to spacy/pipeline/_parser_internals/__init__.py diff --git a/spacy/syntax/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd similarity index 98% rename from spacy/syntax/_state.pxd rename to spacy/pipeline/_parser_internals/_state.pxd index fef4f0c92..0d0dd8c05 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,15 +1,14 @@ -from libc.string cimport memcpy, memset, memmove -from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memcpy, memset +from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from murmurhash.mrmr cimport hash64 -from ..vocab cimport EMPTY_LEXEME -from ..structs cimport TokenC, SpanC -from ..lexeme cimport Lexeme -from ..symbols cimport punct -from ..attrs cimport IS_SPACE -from ..typedefs cimport attr_t +from 
...vocab cimport EMPTY_LEXEME +from ...structs cimport TokenC, SpanC +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...typedefs cimport attr_t cdef inline bint is_space_token(const TokenC* token) nogil: diff --git a/spacy/syntax/_state.pyx b/spacy/pipeline/_parser_internals/_state.pyx similarity index 100% rename from spacy/syntax/_state.pyx rename to spacy/pipeline/_parser_internals/_state.pyx diff --git a/spacy/syntax/arc_eager.pxd b/spacy/pipeline/_parser_internals/arc_eager.pxd similarity index 65% rename from spacy/syntax/arc_eager.pxd rename to spacy/pipeline/_parser_internals/arc_eager.pxd index a59be716a..e05a34f56 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/pipeline/_parser_internals/arc_eager.pxd @@ -1,8 +1,6 @@ -from cymem.cymem cimport Pool - from .stateclass cimport StateClass -from ..typedefs cimport weight_t, attr_t -from .transition_system cimport TransitionSystem, Transition +from ...typedefs cimport weight_t, attr_t +from .transition_system cimport Transition, TransitionSystem cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx similarity index 98% rename from spacy/syntax/arc_eager.pyx rename to spacy/pipeline/_parser_internals/arc_eager.pyx index 6e63859f0..7db8aae0f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -1,24 +1,17 @@ # cython: profile=True, cdivision=True, infer_types=True -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool, Address from libc.stdint cimport int32_t from collections import defaultdict, Counter -import json -from ..typedefs cimport hash_t, attr_t -from ..strings cimport hash_string -from ..structs cimport TokenC -from ..tokens.doc cimport Doc, set_children_from_heads +from ...typedefs cimport hash_t, attr_t +from ...strings cimport hash_string +from ...structs cimport TokenC +from ...tokens.doc cimport Doc, set_children_from_heads +from ...gold.example 
cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold.example cimport Example - -from ..errors import Errors -from .nonproj import is_nonproj_tree -from . import nonproj - # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 diff --git a/spacy/syntax/ner.pxd b/spacy/pipeline/_parser_internals/ner.pxd similarity index 58% rename from spacy/syntax/ner.pxd rename to spacy/pipeline/_parser_internals/ner.pxd index 989593a92..2264a1518 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/pipeline/_parser_internals/ner.pxd @@ -1,6 +1,4 @@ from .transition_system cimport TransitionSystem -from .transition_system cimport Transition -from ..typedefs cimport attr_t cdef class BiluoPushDown(TransitionSystem): diff --git a/spacy/syntax/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx similarity index 98% rename from spacy/syntax/ner.pyx rename to spacy/pipeline/_parser_internals/ner.pyx index c4125bbdf..2570ccdee 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -2,17 +2,14 @@ from collections import Counter from libc.stdint cimport int32_t from cymem.cymem cimport Pool -from ..typedefs cimport weight_t +from ...typedefs cimport weight_t, attr_t +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...gold.example cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport Transition -from .transition_system cimport do_func_t -from ..lexeme cimport Lexeme -from ..attrs cimport IS_SPACE -from ..gold.iob_utils import biluo_tags_from_offsets -from ..gold.example cimport Example - -from ..errors import Errors +from .transition_system cimport Transition, do_func_t cdef enum: diff --git a/spacy/syntax/nonproj.pxd b/spacy/pipeline/_parser_internals/nonproj.pxd similarity index 100% 
rename from spacy/syntax/nonproj.pxd rename to spacy/pipeline/_parser_internals/nonproj.pxd diff --git a/spacy/syntax/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx similarity index 98% rename from spacy/syntax/nonproj.pyx rename to spacy/pipeline/_parser_internals/nonproj.pyx index 5ccb11f37..8f5fdaa71 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -5,9 +5,9 @@ scheme. """ from copy import copy -from ..tokens.doc cimport Doc, set_children_from_heads +from ...tokens.doc cimport Doc, set_children_from_heads -from ..errors import Errors +from ...errors import Errors DELIMITER = '||' diff --git a/spacy/syntax/stateclass.pxd b/spacy/pipeline/_parser_internals/stateclass.pxd similarity index 95% rename from spacy/syntax/stateclass.pxd rename to spacy/pipeline/_parser_internals/stateclass.pxd index 567982a3f..1d9f05538 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/pipeline/_parser_internals/stateclass.pxd @@ -1,12 +1,8 @@ -from libc.string cimport memcpy, memset - from cymem.cymem cimport Pool -cimport cython -from ..structs cimport TokenC, SpanC -from ..typedefs cimport attr_t +from ...structs cimport TokenC, SpanC +from ...typedefs cimport attr_t -from ..vocab cimport EMPTY_LEXEME from ._state cimport StateC diff --git a/spacy/syntax/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx similarity index 97% rename from spacy/syntax/stateclass.pyx rename to spacy/pipeline/_parser_internals/stateclass.pyx index e472e9861..880cf6cc5 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True import numpy -from ..tokens.doc cimport Doc +from ...tokens.doc cimport Doc cdef class StateClass: diff --git a/spacy/syntax/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd similarity index 91% rename from spacy/syntax/transition_system.pxd rename to spacy/pipeline/_parser_internals/transition_system.pxd 
index 836c08168..ba4c33814 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -1,11 +1,11 @@ from cymem.cymem cimport Pool -from ..typedefs cimport attr_t, weight_t -from ..structs cimport TokenC -from ..strings cimport StringStore +from ...typedefs cimport attr_t, weight_t +from ...structs cimport TokenC +from ...strings cimport StringStore +from ...gold.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC -from ..gold.example cimport Example cdef struct Transition: diff --git a/spacy/syntax/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx similarity index 97% rename from spacy/syntax/transition_system.pyx rename to spacy/pipeline/_parser_internals/transition_system.pyx index 17166dcf5..7694e7f34 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,19 +1,17 @@ # cython: infer_types=True from __future__ import print_function -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from collections import Counter import srsly -from ..typedefs cimport weight_t -from ..tokens.doc cimport Doc -from ..structs cimport TokenC +from ...typedefs cimport weight_t, attr_t +from ...tokens.doc cimport Doc +from ...structs cimport TokenC from .stateclass cimport StateClass -from ..typedefs cimport attr_t -from ..errors import Errors -from .. import util +from ...errors import Errors +from ... 
import util cdef weight_t MIN_SCORE = -90000 diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py new file mode 100644 index 000000000..1f1e63959 --- /dev/null +++ b/spacy/pipeline/attributeruler.py @@ -0,0 +1,266 @@ +import srsly +from typing import List, Dict, Union, Iterable, Any, Optional +from pathlib import Path + +from .pipe import Pipe +from ..errors import Errors +from ..language import Language +from ..matcher import Matcher +from ..symbols import IDS +from ..tokens import Doc, Span +from ..tokens._retokenize import normalize_token_attrs, set_token_attrs +from ..vocab import Vocab +from .. import util + + +MatcherPatternType = List[Dict[Union[int, str], Any]] +AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] + + +@Language.factory("attribute_ruler") +def make_attribute_ruler( + nlp: Language, + name: str, + pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, +): + return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts) + + +class AttributeRuler(Pipe): + """Set token-level attributes for tokens matched by Matcher patterns. + Additionally supports importing patterns from tag maps and morph rules. + + DOCS: https://spacy.io/api/attributeruler + """ + + def __init__( + self, + vocab: Vocab, + name: str = "attribute_ruler", + *, + pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, + ) -> None: + """Initialize the AttributeRuler. + + vocab (Vocab): The vocab. + name (str): The pipe name. Defaults to "attribute_ruler". + pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as + the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add + as patterns. + + RETURNS (AttributeRuler): The AttributeRuler component. 
+ + DOCS: https://spacy.io/api/attributeruler#init + """ + self.name = name + self.vocab = vocab + self.matcher = Matcher(self.vocab) + self.attrs = [] + self._attrs_unnormed = [] # store for reference + self.indices = [] + + if pattern_dicts: + self.add_patterns(pattern_dicts) + + def __call__(self, doc: Doc) -> Doc: + """Apply the attributeruler to a Doc and set all attribute exceptions. + + doc (Doc): The document to process. + RETURNS (Doc): The processed Doc. + + DOCS: https://spacy.io/api/attributeruler#call + """ + matches = self.matcher(doc) + + for match_id, start, end in matches: + span = Span(doc, start, end, label=match_id) + attrs = self.attrs[span.label] + index = self.indices[span.label] + try: + token = span[index] + except IndexError: + raise ValueError( + Errors.E1001.format( + patterns=self.matcher.get(span.label), + span=[t.text for t in span], + index=index, + ) + ) + set_token_attrs(token, attrs) + return doc + + def load_from_tag_map( + self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]] + ) -> None: + for tag, attrs in tag_map.items(): + pattern = [{"TAG": tag}] + attrs, morph_attrs = _split_morph_attrs(attrs) + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] + self.add([pattern], attrs) + + def load_from_morph_rules( + self, morph_rules: Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] + ) -> None: + for tag in morph_rules: + for word in morph_rules[tag]: + pattern = [{"ORTH": word, "TAG": tag}] + attrs = morph_rules[tag][word] + attrs, morph_attrs = _split_morph_attrs(attrs) + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] + self.add([pattern], attrs) + + def add( + self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0 + ) -> None: + """Add Matcher patterns for tokens that should be modified with the + provided attributes. 
The token at the specified index within the + matched span will be assigned the attributes. + + patterns (Iterable[List[Dict]]): A list of Matcher patterns. + attrs (Dict): The attributes to assign to the target token in the + matched span. + index (int): The index of the token in the matched span to modify. May + be negative to index from the end of the span. Defaults to 0. + + DOCS: https://spacy.io/api/attributeruler#add + """ + self.matcher.add(len(self.attrs), patterns) + self._attrs_unnormed.append(attrs) + attrs = normalize_token_attrs(self.vocab, attrs) + self.attrs.append(attrs) + self.indices.append(index) + + def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None: + for p in pattern_dicts: + self.add(**p) + + @property + def patterns(self) -> List[AttributeRulerPatternType]: + all_patterns = [] + for i in range(len(self.attrs)): + p = {} + p["patterns"] = self.matcher.get(i)[1] + p["attrs"] = self._attrs_unnormed[i] + p["index"] = self.indices[i] + all_patterns.append(p) + return all_patterns + + def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + """Serialize the attributeruler to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/attributeruler#to_bytes + """ + serialize = {} + serialize["vocab"] = self.vocab.to_bytes + patterns = {k: self.matcher.get(k)[1] for k in range(len(self.attrs))} + serialize["patterns"] = lambda: srsly.msgpack_dumps(patterns) + serialize["attrs"] = lambda: srsly.msgpack_dumps(self.attrs) + serialize["indices"] = lambda: srsly.msgpack_dumps(self.indices) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): + """Load the attributeruler from a bytestring. + + bytes_data (bytes): The data to load. + exclude (Iterable[str]): String names of serialization fields to exclude. 
+ returns (AttributeRuler): The loaded object. + + DOCS: https://spacy.io/api/attributeruler#from_bytes + """ + data = {"patterns": b""} + + def load_patterns(b): + data["patterns"] = srsly.msgpack_loads(b) + + def load_attrs(b): + self.attrs = srsly.msgpack_loads(b) + + def load_indices(b): + self.indices = srsly.msgpack_loads(b) + + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "patterns": load_patterns, + "attrs": load_attrs, + "indices": load_indices, + } + util.from_bytes(bytes_data, deserialize, exclude) + + if data["patterns"]: + for key, pattern in data["patterns"].items(): + self.matcher.add(key, pattern) + assert len(self.attrs) == len(data["patterns"]) + assert len(self.indices) == len(data["patterns"]) + + return self + + def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None: + """Serialize the attributeruler to disk. + + path (Union[Path, str]): A path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + DOCS: https://spacy.io/api/attributeruler#to_disk + """ + patterns = {k: self.matcher.get(k)[1] for k in range(len(self.attrs))} + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "patterns": lambda p: srsly.write_msgpack(p, patterns), + "attrs": lambda p: srsly.write_msgpack(p, self.attrs), + "indices": lambda p: srsly.write_msgpack(p, self.indices), + } + util.to_disk(path, serialize, exclude) + + def from_disk( + self, path: Union[Path, str], exclude: Iterable[str] = tuple() + ) -> None: + """Load the attributeruler from disk. + + path (Union[Path, str]): A path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. 
+ DOCS: https://spacy.io/api/attributeruler#from_disk + """ + data = {"patterns": b""} + + def load_patterns(p): + data["patterns"] = srsly.read_msgpack(p) + + def load_attrs(p): + self.attrs = srsly.read_msgpack(p) + + def load_indices(p): + self.indices = srsly.read_msgpack(p) + + deserialize = { + "vocab": lambda p: self.vocab.from_disk(p), + "patterns": load_patterns, + "attrs": load_attrs, + "indices": load_indices, + } + util.from_disk(path, deserialize, exclude) + + if data["patterns"]: + for key, pattern in data["patterns"].items(): + self.matcher.add(key, pattern) + assert len(self.attrs) == len(data["patterns"]) + assert len(self.indices) == len(data["patterns"]) + + return self + + +def _split_morph_attrs(attrs): + """Split entries from a tag map or morph rules dict into to two dicts, one + with the token-level features (POS, LEMMA) and one with the remaining + features, which are presumed to be individual MORPH features.""" + other_attrs = {} + morph_attrs = {} + for k, v in attrs.items(): + if k in "_" or k in IDS.keys() or k in IDS.values(): + other_attrs[k] = v + else: + morph_attrs[k] = v + return other_attrs, morph_attrs diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a952385b4..a022d04d6 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,13 +1,13 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.arc_eager cimport ArcEager +from .transition_parser cimport Parser +from ._parser_internals.arc_eager cimport ArcEager from .functions import merge_subtokens from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..scorer import Scorer @@ -34,7 +34,7 @@ DEFAULT_PARSER_MODEL = 
Config().from_str(default_model_config)["model"] @Language.factory( "parser", - assigns=["token.dep", "token.is_sent_start", "doc.sents"], + assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], default_config={ "moves": None, "update_with_oracle_cut_size": 100, @@ -120,7 +120,8 @@ cdef class DependencyParser(Parser): return dep results = {} results.update(Scorer.score_spans(examples, "sents", **kwargs)) - results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, - ignore_labels=("p", "punct"), **kwargs)) + kwargs.setdefault("getter", dep_getter) + kwargs.setdefault("ignore_labels", ("p", "punct")) + results.update(Scorer.score_deps(examples, "dep", **kwargs)) del results["sents_per_type"] return results diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index cc4e7b159..923d925dc 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -33,24 +33,31 @@ dropout = null """ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] +default_kb_config = """ +[kb] +@assets = "spacy.EmptyKB.v1" +entity_vector_length = 64 +""" +DEFAULT_NEL_KB = Config().from_str(default_kb_config)["kb"] + @Language.factory( "entity_linker", requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], assigns=["token.ent_kb_id"], default_config={ - "kb": None, # TODO - what kind of default makes sense here? + "kb": DEFAULT_NEL_KB, + "model": DEFAULT_NEL_MODEL, "labels_discard": [], "incl_prior": True, "incl_context": True, - "model": DEFAULT_NEL_MODEL, }, ) def make_entity_linker( nlp: Language, name: str, model: Model, - kb: Optional[KnowledgeBase], + kb: KnowledgeBase, *, labels_discard: Iterable[str], incl_prior: bool, @@ -92,10 +99,10 @@ class EntityLinker(Pipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. 
- kb (KnowledgeBase): TODO: - labels_discard (Iterable[str]): TODO: - incl_prior (bool): TODO: - incl_context (bool): TODO: + kb (KnowledgeBase): The KnowledgeBase holding all entities and their aliases. + labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. + incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. + incl_context (bool): Whether or not to include the local context in the model. DOCS: https://spacy.io/api/entitylinker#init """ @@ -108,14 +115,12 @@ class EntityLinker(Pipe): "incl_prior": incl_prior, "incl_context": incl_context, } - self.kb = kb - if self.kb is None: - # create an empty KB that should be filled by calling from_disk - self.kb = KnowledgeBase(vocab=vocab) - else: - del cfg["kb"] # we don't want to duplicate its serialization - if not isinstance(self.kb, KnowledgeBase): - raise ValueError(Errors.E990.format(type=type(self.kb))) + if not isinstance(kb, KnowledgeBase): + raise ValueError(Errors.E990.format(type=type(kb))) + kb.initialize(vocab) + self.kb = kb + if "kb" in cfg: + del cfg["kb"] # we don't want to duplicate its serialization self.cfg = dict(cfg) self.distance = CosineDistance(normalize=False) # how many neightbour sentences to take into account @@ -222,9 +227,9 @@ class EntityLinker(Pipe): set_dropout_rate(self.model, drop) if not sentence_docs: warnings.warn(Warnings.W093.format(name="Entity Linker")) - return 0.0 + return losses sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss( + loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) bp_context(d_scores) @@ -235,7 +240,7 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return losses - def get_similarity_loss(self, examples: Iterable[Example], sentence_encodings): + def get_loss(self, examples: Iterable[Example], sentence_encodings): entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) @@ 
-247,7 +252,7 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") if sentence_encodings.shape != entity_encodings.shape: err = Errors.E147.format( - method="get_similarity_loss", msg="gold entities do not match up" + method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) gradients = self.distance.get_grad(sentence_encodings, entity_encodings) @@ -337,13 +342,13 @@ class EntityLinker(Pipe): final_kb_ids.append(candidates[0].entity_) else: random.shuffle(candidates) - # this will set all prior probabilities to 0 if they should be excluded from the model + # set all prior probabilities to 0 if incl_prior=False prior_probs = xp.asarray( [c.prior_prob for c in candidates] ) if not self.cfg.get("incl_prior"): prior_probs = xp.asarray( - [0.0 for c in candidates] + [0.0 for _ in candidates] ) scores = prior_probs # add in similarity from the context @@ -437,9 +442,8 @@ class EntityLinker(Pipe): raise ValueError(Errors.E149) def load_kb(p): - self.kb = KnowledgeBase( - vocab=self.vocab, entity_vector_length=self.cfg["entity_width"] - ) + self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"]) + self.kb.initialize(self.vocab) self.kb.load_bulk(p) deserialize = {} diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 97826aaa6..d85030adb 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional import numpy -from thinc.api import CosineDistance, to_categorical, to_categorical, Model, Config +from thinc.api import CosineDistance, to_categorical, Model, Config from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc @@ -9,7 +9,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..attrs import 
POS, ID from ..errors import Errors @@ -219,3 +219,6 @@ class ClozeMultitask(Pipe): if losses is not None: losses[self.name] += loss + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 7ee4448fb..7f4fb8363 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,9 +1,9 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.ner cimport BiluoPushDown +from .transition_parser cimport Parser +from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer diff --git a/spacy/gold.pyx b/spacy/pipeline/nn_parser.pyx similarity index 100% rename from spacy/gold.pyx rename to spacy/pipeline/nn_parser.pyx diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd new file mode 100644 index 000000000..bb97f79d0 --- /dev/null +++ b/spacy/pipeline/pipe.pxd @@ -0,0 +1,2 @@ +cdef class Pipe: + cdef public str name diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 196cdebdc..1a94905a2 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -8,7 +8,7 @@ from ..errors import Errors from .. import util -class Pipe: +cdef class Pipe: """This class is a base class and not instantiated directly. Trainable pipeline components like the EntityRecognizer or TextCategorizer inherit from it and it defines the interface that components should follow to @@ -17,8 +17,6 @@ class Pipe: DOCS: https://spacy.io/api/pipe """ - name = None - def __init__(self, vocab, model, name, **cfg): """Initialize a pipeline component. 
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 31208ea2c..be4351212 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -203,3 +203,9 @@ class Sentencizer(Pipe): cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self + + def get_loss(self, examples, scores): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index c6eb43661..620a8557e 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -108,8 +108,8 @@ class SentenceRecognizer(Tagger): truths = [] for eg in examples: eg_truth = [] - for x in eg.get_aligned("sent_start"): - if x == None: + for x in eg.get_aligned("SENT_START"): + if x is None: eg_truth.append(None) elif x == 1: eg_truth.append(labels[1]) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 9b9872b77..43a3283ca 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -4,12 +4,12 @@ from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model from thinc.api import Optimizer, Config from thinc.util import to_numpy +from ..errors import Errors from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob from ..tokens import Doc from ..language import Language from ..vocab import Vocab from ..scorer import Scorer -from .. 
import util from .pipe import Pipe @@ -37,7 +37,6 @@ DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"] default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL}, scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, - ) def make_simple_ner( nlp: Language, name: str, model: Model, labels: Iterable[str] @@ -60,7 +59,9 @@ class SimpleNER(Pipe): self.vocab = vocab self.model = model self.name = name - self.labels = labels + self.cfg = {"labels": []} + for label in labels: + self.add_label(label) self.loss_func = SequenceCategoricalCrossentropy( names=self.get_tag_names(), normalize=True, missing_value=None ) @@ -70,9 +71,20 @@ class SimpleNER(Pipe): def is_biluo(self) -> bool: return self.model.name.startswith("biluo") + @property + def labels(self) -> Tuple[str]: + return tuple(self.cfg["labels"]) + def add_label(self, label: str) -> None: + """Add a new label to the pipe. + label (str): The label to add. 
+ DOCS: https://spacy.io/api/simplener#add_label + """ + if not isinstance(label, str): + raise ValueError(Errors.E187) if label not in self.labels: - self.labels.append(label) + self.cfg["labels"].append(label) + self.vocab.strings.add(label) def get_tag_names(self) -> List[str]: if self.is_biluo: @@ -131,11 +143,9 @@ class SimpleNER(Pipe): return losses def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]: - loss = 0 - d_scores = [] truths = [] for eg in examples: - tags = eg.get_aligned("TAG", as_string=True) + tags = eg.get_aligned_ner() gold_tags = [(tag if tag != "-" else None) for tag in tags] if not self.is_biluo: gold_tags = biluo_to_iob(gold_tags) @@ -159,7 +169,6 @@ class SimpleNER(Pipe): if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples - labels = _get_labels(get_examples()) for label in _get_labels(get_examples()): self.add_label(label) labels = self.labels diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index f2e06efed..43f5b02cb 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -259,7 +259,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#get_loss """ loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) - truths = [eg.get_aligned("tag", as_string=True) for eg in examples] + truths = [eg.get_aligned("TAG", as_string=True) for eg in examples] d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): raise ValueError("nan value when computing loss") diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 2c399defc..bc16e790f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -238,8 +238,11 @@ class TextCategorizer(Pipe): DOCS: https://spacy.io/api/textcategorizer#rehearse """ + + if losses is not None: + losses.setdefault(self.name, 0.0) if self._rehearsal_model is None: - return + return losses try: docs = [eg.predicted for eg in 
examples] except AttributeError: @@ -250,7 +253,7 @@ class TextCategorizer(Pipe): raise TypeError(err) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. - return + return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) target = self._rehearsal_model(examples) @@ -259,7 +262,6 @@ class TextCategorizer(Pipe): if sgd is not None: self.model.finish_update(sgd) if losses is not None: - losses.setdefault(self.name, 0.0) losses[self.name] += (gradient ** 2).sum() return losses @@ -353,7 +355,7 @@ class TextCategorizer(Pipe): for cat in y.cats: self.add_label(cat) self.require_labels() - docs = [Doc(Vocab(), words=["hello"])] + docs = [Doc(self.vocab, words=["hello"])] truths, _ = self._examples_to_truth(examples) self.set_output(len(self.labels)) self.model.initialize(X=docs, Y=truths) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index b147cf177..31643a7d3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -199,6 +199,9 @@ class Tok2Vec(Pipe): docs = [Doc(self.vocab, words=["hello"])] self.model.initialize(X=docs) + def add_label(self, label): + raise NotImplementedError + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/syntax/nn_parser.pxd b/spacy/pipeline/transition_parser.pxd similarity index 62% rename from spacy/syntax/nn_parser.pxd rename to spacy/pipeline/transition_parser.pxd index 7840ec27a..e594a3098 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/pipeline/transition_parser.pxd @@ -1,16 +1,15 @@ -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem +from cymem.cymem cimport Pool + from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC -from ._parser_model cimport WeightsC, ActivationsC, SizesC +from .pipe cimport Pipe +from ._parser_internals.transition_system cimport 
Transition, TransitionSystem +from ._parser_internals._state cimport StateC +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC -cdef class Parser: +cdef class Parser(Pipe): cdef readonly Vocab vocab cdef public object model - cdef public str name cdef public object _rehearsal_model cdef readonly TransitionSystem moves cdef readonly object cfg diff --git a/spacy/syntax/nn_parser.pyx b/spacy/pipeline/transition_parser.pyx similarity index 95% rename from spacy/syntax/nn_parser.pyx rename to spacy/pipeline/transition_parser.pyx index a0ee13a0a..b14a55cb4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,42 +1,32 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel +from __future__ import print_function +from cymem.cymem cimport Pool cimport numpy as np from itertools import islice -from cpython.ref cimport PyObject, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno -from libc.math cimport exp from libcpp.vector cimport vector -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.stdlib cimport calloc, free -from cymem.cymem cimport Pool -from thinc.backends.linalg cimport Vec, VecVec -from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops -from thinc.api import get_array_module, zero_init, set_dropout_rate -from itertools import islice import srsly + +from ._parser_internals.stateclass cimport StateClass +from ..ml.parser_model cimport alloc_activations, free_activations +from ..ml.parser_model cimport predict_states, arg_max_if_valid +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss +from ..ml.parser_model cimport get_c_weights, get_c_sizes + +from ..tokens.doc cimport Doc +from ..errors import Errors, Warnings +from .. 
import util +from ..util import create_default_optimizer + +from thinc.api import set_dropout_rate import numpy.random import numpy import warnings -from ..tokens.doc cimport Doc -from ..typedefs cimport weight_t, class_t, hash_t -from ._parser_model cimport alloc_activations, free_activations -from ._parser_model cimport predict_states, arg_max_if_valid -from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss -from ._parser_model cimport get_c_weights, get_c_sizes -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system cimport Transition -from ..util import create_default_optimizer, registry -from ..compat import copy_array -from ..errors import Errors, Warnings -from .. import util -from . import nonproj - - -cdef class Parser: +cdef class Parser(Pipe): """ Base class of the DependencyParser and EntityRecognizer. """ @@ -107,7 +97,7 @@ cdef class Parser: @property def tok2vec(self): - '''Return the embedding and convolutional layer of the model.''' + """Return the embedding and convolutional layer of the model.""" return self.model.get_ref("tok2vec") @property @@ -138,13 +128,13 @@ cdef class Parser: raise NotImplementedError def init_multitask_objectives(self, get_examples, pipeline, **cfg): - '''Setup models for secondary objectives, to benefit from multi-task + """Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. For instance, the dependency parser can benefit from sharing an input representation with a label prediction model. These auxiliary models are discarded after training. 
- ''' + """ pass def use_params(self, params): diff --git a/spacy/schemas.py b/spacy/schemas.py index 971d283e2..d599ccbb2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,5 @@ from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type +from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -8,6 +9,16 @@ from thinc.api import Optimizer from .attrs import NAMES +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from .language import Language # noqa: F401 + from .gold import Example # noqa: F401 + + +ItemT = TypeVar("ItemT") +Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] +Reader = Callable[["Language", str], Iterable["Example"]] + def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: """Validate data against a given pydantic schema. 
@@ -181,30 +192,22 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off - base_model: Optional[StrictStr] = Field(..., title="The base model to use") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - gold_preproc: StrictBool = Field(..., title="Whether to train on gold-standard sentences and tokens") - max_length: StrictInt = Field(..., title="Maximum length of examples (longer examples are divided into sentences if possible)") - limit: StrictInt = Field(..., title="Number of examples to use (0 for all)") - orth_variant_level: StrictFloat = Field(..., title="Orth variants for data augmentation") + train_corpus: Reader = Field(..., title="Reader for the training data") + dev_corpus: Reader = Field(..., title="Reader for the dev data") + batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") - eval_batch_size: StrictInt = Field(..., title="Evaluation batch size") seed: Optional[StrictInt] = Field(..., title="Random seed") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") - use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size") - batch_by: StrictStr = Field(..., 
title="Batch examples by type") - raw_text: Optional[StrictStr] = Field(..., title="Raw text") - tag_map: Optional[StrictStr] = Field(..., title="Path to JSON-formatted tag map") - morph_rules: Optional[StrictStr] = Field(..., title="Path to morphology rules") - batch_size: Union[Sequence[int], int] = Field(..., title="The batch size or batch size schedule") + raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") + frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") # fmt: on class Config: @@ -219,6 +222,9 @@ class ConfigSchemaNlp(BaseModel): tokenizer: Callable = Field(..., title="The tokenizer to use") lemmatizer: Callable = Field(..., title="The lemmatizer to use") load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") + before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") + after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") + after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") # fmt: on class Config: diff --git a/spacy/scorer.py b/spacy/scorer.py index 702c74521..40a819e7c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,55 +1,61 @@ +from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING import numpy as np +from .gold import Example +from .tokens import Token, Doc from .errors import Errors from .util import get_lang_class from .morphology import Morphology +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. 
without causing circular imports + from .language import Language # noqa: F401 + + +DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] + class PRFScore: - """ - A precision / recall / F score - """ + """A precision / recall / F score.""" - def __init__(self): + def __init__(self) -> None: self.tp = 0 self.fp = 0 self.fn = 0 - def score_set(self, cand, gold): + def score_set(self, cand: set, gold: set) -> None: self.tp += len(cand.intersection(gold)) self.fp += len(cand - gold) self.fn += len(gold - cand) @property - def precision(self): + def precision(self) -> float: return self.tp / (self.tp + self.fp + 1e-100) @property - def recall(self): + def recall(self) -> float: return self.tp / (self.tp + self.fn + 1e-100) @property - def fscore(self): + def fscore(self) -> float: p = self.precision r = self.recall return 2 * ((p * r) / (p + r + 1e-100)) - def to_dict(self): + def to_dict(self) -> Dict[str, float]: return {"p": self.precision, "r": self.recall, "f": self.fscore} class ROCAUCScore: - """ - An AUC ROC score. - """ + """An AUC ROC score.""" - def __init__(self): + def __init__(self) -> None: self.golds = [] self.cands = [] self.saved_score = 0.0 self.saved_score_at_len = 0 - def score_set(self, cand, gold): + def score_set(self, cand, gold) -> None: self.cands.append(cand) self.golds.append(gold) @@ -70,51 +76,52 @@ class ROCAUCScore: class Scorer: """Compute evaluation scores.""" - def __init__(self, nlp=None, **cfg): + def __init__( + self, + nlp: Optional["Language"] = None, + default_lang: str = "xx", + default_pipeline=DEFAULT_PIPELINE, + **cfg, + ) -> None: """Initialize the Scorer. 
DOCS: https://spacy.io/api/scorer#init """ self.nlp = nlp self.cfg = cfg - if not nlp: - # create a default pipeline - nlp = get_lang_class("xx")() - nlp.add_pipe("senter") - nlp.add_pipe("tagger") - nlp.add_pipe("morphologizer") - nlp.add_pipe("parser") - nlp.add_pipe("ner") - nlp.add_pipe("textcat") + nlp = get_lang_class(default_lang)() + for pipe in default_pipeline: + nlp.add_pipe(pipe) self.nlp = nlp - def score(self, examples): + def score(self, examples: Iterable[Example]) -> Dict[str, Any]: """Evaluate a list of Examples. examples (Iterable[Example]): The predicted annotations + correct annotations. RETURNS (Dict): A dictionary of scores. + DOCS: https://spacy.io/api/scorer#score """ scores = {} - if hasattr(self.nlp.tokenizer, "score"): scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) for name, component in self.nlp.pipeline: if hasattr(component, "score"): scores.update(component.score(examples, **self.cfg)) - return scores @staticmethod - def score_tokenization(examples, **cfg): + def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: """Returns accuracy and PRF scores for tokenization. - * token_acc: # correct tokens / # gold tokens * token_p/r/f: PRF for token character spans examples (Iterable[Example]): Examples to score - RETURNS (dict): A dictionary containing the scores token_acc/p/r/f. + RETURNS (Dict[str, float]): A dictionary containing the scores + token_acc/p/r/f. + + DOCS: https://spacy.io/api/scorer#score_tokenization """ acc_score = PRFScore() prf_score = PRFScore() @@ -145,16 +152,24 @@ class Scorer: } @staticmethod - def score_token_attr(examples, attr, getter=getattr, **cfg): + def score_token_attr( + examples: Iterable[Example], + attr: str, + *, + getter: Callable[[Token, str], Any] = getattr, + **cfg, + ) -> Dict[str, float]: """Returns an accuracy score for a token-level attribute. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. 
- getter (callable): Defaults to getattr. If provided, + getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. - RETURNS (dict): A dictionary containing the accuracy score under the - key attr_acc. + RETURNS (Dict[str, float]): A dictionary containing the accuracy score + under the key attr_acc. + + DOCS: https://spacy.io/api/scorer#score_token_attr """ tag_score = PRFScore() for example in examples: @@ -172,17 +187,21 @@ class Scorer: gold_i = align.x2y[token.i].dataXd[0, 0] pred_tags.add((gold_i, getter(token, attr))) tag_score.score_set(pred_tags, gold_tags) - return { - attr + "_acc": tag_score.fscore, - } + return {f"{attr}_acc": tag_score.fscore} @staticmethod - def score_token_attr_per_feat(examples, attr, getter=getattr, **cfg): + def score_token_attr_per_feat( + examples: Iterable[Example], + attr: str, + *, + getter: Callable[[Token, str], Any] = getattr, + **cfg, + ): """Return PRF scores per feat for a token attribute in UFEATS format. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. - getter (callable): Defaults to getattr. If provided, + getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. RETURNS (dict): A dictionary containing the per-feat PRF scores unders @@ -223,20 +242,26 @@ class Scorer: per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), ) - return { - attr + "_per_feat": per_feat, - } + return {f"{attr}_per_feat": per_feat} @staticmethod - def score_spans(examples, attr, getter=getattr, **cfg): + def score_spans( + examples: Iterable[Example], + attr: str, + *, + getter: Callable[[Doc, str], Any] = getattr, + **cfg, + ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. 
- getter (callable): Defaults to getattr. If provided, + getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided, getter(doc, attr) should return the spans for the individual doc. - RETURNS (dict): A dictionary containing the PRF scores under the - keys attr_p/r/f and the per-type PRF scores under attr_per_type. + RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under + the keys attr_p/r/f and the per-type PRF scores under attr_per_type. + + DOCS: https://spacy.io/api/scorer#score_spans """ score = PRFScore() score_per_type = dict() @@ -256,14 +281,12 @@ class Scorer: # Find all predidate labels, for all and per type gold_spans = set() pred_spans = set() - # Special case for ents: # If we have missing values in the gold, we can't easily tell # whether our NER predictions are true. # It seems bad but it's what we've always done. if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc): continue - for span in getter(gold_doc, attr): gold_span = (span.label_, span.start, span.end - 1) gold_spans.add(gold_span) @@ -279,38 +302,39 @@ class Scorer: # Score for all labels score.score_set(pred_spans, gold_spans) results = { - attr + "_p": score.precision, - attr + "_r": score.recall, - attr + "_f": score.fscore, - attr + "_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + f"{attr}_p": score.precision, + f"{attr}_r": score.recall, + f"{attr}_f": score.fscore, + f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, } return results @staticmethod def score_cats( - examples, - attr, - getter=getattr, - labels=[], - multi_label=True, - positive_label=None, - **cfg - ): + examples: Iterable[Example], + attr: str, + *, + getter: Callable[[Doc, str], Any] = getattr, + labels: Iterable[str] = tuple(), + multi_label: bool = True, + positive_label: Optional[str] = None, + **cfg, + ) -> Dict[str, Any]: """Returns PRF and ROC AUC scores for a doc-level attribute with a dict with scores for each label like 
Doc.cats. The reported overall score depends on the scorer settings. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. - getter (callable): Defaults to getattr. If provided, + getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided, getter(doc, attr) should return the values for the individual doc. labels (Iterable[str]): The set of possible labels. Defaults to []. multi_label (bool): Whether the attribute allows multiple labels. Defaults to True. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. - RETURNS (dict): A dictionary containing the scores, with inapplicable - scores as None: + RETURNS (Dict[str, Any]): A dictionary containing the scores, with + inapplicable scores as None: for all: attr_score (one of attr_f / attr_macro_f / attr_macro_auc), attr_score_desc (text description of the overall score), @@ -319,6 +343,8 @@ class Scorer: for binary exclusive with positive label: attr_p/r/f for 3+ exclusive classes, macro-averaged fscore: attr_macro_f for multilabel, macro-averaged AUC: attr_macro_auc + + DOCS: https://spacy.io/api/scorer#score_cats """ score = PRFScore() f_per_type = dict() @@ -367,64 +393,67 @@ class Scorer: ) ) results = { - attr + "_score": None, - attr + "_score_desc": None, - attr + "_p": None, - attr + "_r": None, - attr + "_f": None, - attr + "_macro_f": None, - attr + "_macro_auc": None, - attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, - attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, + f"{attr}_score": None, + f"{attr}_score_desc": None, + f"{attr}_p": None, + f"{attr}_r": None, + f"{attr}_f": None, + f"{attr}_macro_f": None, + f"{attr}_macro_auc": None, + f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, + f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: - results[attr + "_p"] = 
score.precision - results[attr + "_r"] = score.recall - results[attr + "_f"] = score.fscore - results[attr + "_score"] = results[attr + "_f"] - results[attr + "_score_desc"] = "F (" + positive_label + ")" + results[f"{attr}_p"] = score.precision + results[f"{attr}_r"] = score.recall + results[f"{attr}_f"] = score.fscore + results[f"{attr}_score"] = results[f"{attr}_f"] + results[f"{attr}_score_desc"] = f"F ({positive_label})" elif not multi_label: - results[attr + "_macro_f"] = sum( + results[f"{attr}_macro_f"] = sum( [score.fscore for label, score in f_per_type.items()] ) / (len(f_per_type) + 1e-100) - results[attr + "_score"] = results[attr + "_macro_f"] - results[attr + "_score_desc"] = "macro F" + results[f"{attr}_score"] = results[f"{attr}_macro_f"] + results[f"{attr}_score_desc"] = "macro F" else: - results[attr + "_macro_auc"] = max( + results[f"{attr}_macro_auc"] = max( sum([score.score for label, score in auc_per_type.items()]) / (len(auc_per_type) + 1e-100), -1, ) - results[attr + "_score"] = results[attr + "_macro_auc"] - results[attr + "_score_desc"] = "macro AUC" + results[f"{attr}_score"] = results[f"{attr}_macro_auc"] + results[f"{attr}_score_desc"] = "macro AUC" return results @staticmethod def score_deps( - examples, - attr, - getter=getattr, - head_attr="head", - head_getter=getattr, - ignore_labels=tuple(), - **cfg - ): + examples: Iterable[Example], + attr: str, + *, + getter: Callable[[Token, str], Any] = getattr, + head_attr: str = "head", + head_getter: Callable[[Token, str], Any] = getattr, + ignore_labels: Tuple[str] = tuple(), + **cfg, + ) -> Dict[str, Any]: """Returns the UAS, LAS, and LAS per type scores for dependency parses. examples (Iterable[Example]): Examples to score attr (str): The attribute containing the dependency label. - getter (callable): Defaults to getattr. If provided, + getter (Callable[[Token, str], Any]): Defaults to getattr. 
If provided, getter(token, attr) should return the value of the attribute for an individual token. head_attr (str): The attribute containing the head token. Defaults to 'head'. - head_getter (callable): Defaults to getattr. If provided, + head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, head_getter(token, attr) should return the value of the head for an individual token. ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct). - RETURNS (dict): A dictionary containing the scores: + RETURNS (Dict[str, Any]): A dictionary containing the scores: attr_uas, attr_las, and attr_las_per_type. + + DOCS: https://spacy.io/api/scorer#score_deps """ unlabelled = PRFScore() labelled = PRFScore() @@ -482,10 +511,11 @@ class Scorer: set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) ) return { - attr + "_uas": unlabelled.fscore, - attr + "_las": labelled.fscore, - attr - + "_las_per_type": {k: v.to_dict() for k, v in labelled_per_dep.items()}, + f"{attr}_uas": unlabelled.fscore, + f"{attr}_las": labelled.fscore, + f"{attr}_las_per_type": { + k: v.to_dict() for k, v in labelled_per_dep.items() + }, } diff --git a/spacy/syntax/__init__.pxd b/spacy/syntax/__init__.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 91b0ec922..79e8f31c0 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -282,3 +282,15 @@ def test_span_eq_hash(doc, doc_not_parsed): assert hash(doc[0:2]) == hash(doc[0:2]) assert hash(doc[0:2]) != hash(doc[1:3]) assert hash(doc[0:2]) != hash(doc_not_parsed[0:2]) + + +def test_span_boundaries(doc): + start = 1 + end = 5 + span = doc[start:end] + for i in range(start, end): + assert span[i - start] == doc[i] + with pytest.raises(IndexError): + span[-5] + with pytest.raises(IndexError): + span[5] diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 
015f92785..1c6fdf419 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): nlp = Chinese( meta={ - "tokenizer": { - "config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",} - } + "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}} } ) zh_tokenizer_serialize(nlp.tokenizer) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 8f4c13471..5f4c2991a 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -21,7 +21,7 @@ re_pattern5 = "B*A*B" longest1 = "A A A A A" longest2 = "A A A A A" longest3 = "A A" -longest4 = "B A A A A A B" # "FIRST" would be "B B" +longest4 = "B A A A A A B" # "FIRST" would be "B B" longest5 = "B B A A A A A B" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 77e142215..fd1880030 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -4,8 +4,8 @@ from spacy import registry from spacy.gold import Example from spacy.pipeline import DependencyParser from spacy.tokens import Doc -from spacy.syntax.nonproj import projectivize -from spacy.syntax.arc_eager import ArcEager +from spacy.pipeline._parser_internals.nonproj import projectivize +from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 4a6bf73a5..dbeb0a9cb 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -5,7 +5,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups -from spacy.syntax.ner import BiluoPushDown +from 
spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.gold import Example from spacy.tokens import Doc from spacy.vocab import Vocab @@ -210,7 +210,7 @@ def test_train_empty(): nlp.begin_training() for itn in range(2): losses = {} - batches = util.minibatch(train_examples) + batches = util.minibatch(train_examples, size=8) for batch in batches: nlp.update(batch, losses=losses) diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index feae52f7f..6594c7e78 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -3,8 +3,8 @@ import pytest from spacy import registry from spacy.gold import Example from spacy.vocab import Vocab -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.nn_parser import Parser +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.pipeline.transition_parser import Parser from spacy.tokens.doc import Doc from thinc.api import Model from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 496ec7e03..41da7cf49 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,7 +1,7 @@ import pytest -from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc -from spacy.syntax.nonproj import is_nonproj_tree -from spacy.syntax import nonproj +from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle +from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc +from spacy.pipeline._parser_internals import nonproj from ..util import get_doc diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 4e1407707..df3d7dff5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,15 +1,10 @@ -import spacy.language from spacy.language import Language -from 
spacy.pipe_analysis import print_summary, validate_attrs -from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.pipe_analysis import count_pipeline_interdependencies +from spacy.pipe_analysis import get_attr_info, validate_attrs from mock import Mock import pytest def test_component_decorator_assigns(): - spacy.language.ENABLE_PIPELINE_ANALYSIS = True - @Language.component("c1", assigns=["token.tag", "doc.tensor"]) def test_component1(doc): return doc @@ -32,10 +27,11 @@ def test_component_decorator_assigns(): nlp = Language() nlp.add_pipe("c1") - with pytest.warns(UserWarning): - nlp.add_pipe("c2") + nlp.add_pipe("c2") + problems = nlp.analyze_pipes()["problems"] + assert problems["c2"] == ["token.pos"] nlp.add_pipe("c3") - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"] + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"] nlp.add_pipe("c1", name="c4") test_component4_meta = nlp.get_pipe_meta("c1") assert test_component4_meta.factory == "c1" @@ -43,9 +39,8 @@ def test_component_decorator_assigns(): assert not Language.has_factory("c4") assert nlp.pipe_factories["c1"] == "c1" assert nlp.pipe_factories["c4"] == "c1" - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"] - assert get_requires_for_attr(nlp, "token.pos") == ["c2"] - assert print_summary(nlp, no_print=True) + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"] + assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"] assert nlp("hello world") @@ -100,7 +95,6 @@ def test_analysis_validate_attrs_invalid(attr): def test_analysis_validate_attrs_remove_pipe(): """Test that attributes are validated correctly on remove.""" - spacy.language.ENABLE_PIPELINE_ANALYSIS = True @Language.component("pipe_analysis_c6", assigns=["token.tag"]) def c1(doc): @@ -112,26 +106,9 @@ def test_analysis_validate_attrs_remove_pipe(): nlp = Language() nlp.add_pipe("pipe_analysis_c6") - with pytest.warns(UserWarning): - 
nlp.add_pipe("pipe_analysis_c7") - with pytest.warns(None) as record: - nlp.remove_pipe("pipe_analysis_c7") - assert not record.list - - -def test_pipe_interdependencies(): - prefix = "test_pipe_interdependencies" - - @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",)) - def fancifier(doc): - return doc - - @Language.component(f"{prefix}.needer", requires=("doc._.fancy",)) - def needer(doc): - return doc - - nlp = Language() - nlp.add_pipe(f"{prefix}.fancifier") - nlp.add_pipe(f"{prefix}.needer") - counts = count_pipeline_interdependencies(nlp) - assert counts == [1, 0] + nlp.add_pipe("pipe_analysis_c7") + problems = nlp.analyze_pipes()["problems"] + assert problems["pipe_analysis_c7"] == ["token.pos"] + nlp.remove_pipe("pipe_analysis_c7") + problems = nlp.analyze_pipes()["problems"] + assert all(p == [] for p in problems.values()) diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py new file mode 100644 index 000000000..bcde7bf63 --- /dev/null +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -0,0 +1,207 @@ +import pytest +import numpy +from spacy.lang.en import English +from spacy.pipeline import AttributeRuler +from spacy import util, registry + +from ..util import get_doc, make_tempdir + + +@pytest.fixture +def nlp(): + return English() + + +@pytest.fixture +def pattern_dicts(): + return [ + { + "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], + "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, + }, + # one pattern sets the lemma + {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, + # another pattern sets the morphology + { + "patterns": [[{"ORTH": "test"}]], + "attrs": {"MORPH": "Case=Nom|Number=Sing"}, + "index": 0, + }, + ] + + +@registry.assets("attribute_ruler_patterns") +def attribute_ruler_patterns(): + return [ + { + "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], + "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, + }, + # one pattern 
sets the lemma + {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, + # another pattern sets the morphology + { + "patterns": [[{"ORTH": "test"}]], + "attrs": {"MORPH": "Case=Nom|Number=Sing"}, + "index": 0, + }, + ] + + +@pytest.fixture +def tag_map(): + return { + ".": {"POS": "PUNCT", "PunctType": "peri"}, + ",": {"POS": "PUNCT", "PunctType": "comm"}, + } + + +@pytest.fixture +def morph_rules(): + return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}} + + +def test_attributeruler_init(nlp, pattern_dicts): + a = nlp.add_pipe("attribute_ruler") + for p in pattern_dicts: + a.add(**p) + + doc = nlp("This is a test.") + assert doc[2].lemma_ == "the" + assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert doc[3].lemma_ == "cat" + assert doc[3].morph_ == "Case=Nom|Number=Sing" + + +def test_attributeruler_init_patterns(nlp, pattern_dicts): + # initialize with patterns + nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + doc = nlp("This is a test.") + assert doc[2].lemma_ == "the" + assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert doc[3].lemma_ == "cat" + assert doc[3].morph_ == "Case=Nom|Number=Sing" + nlp.remove_pipe("attribute_ruler") + # initialize with patterns from asset + nlp.add_pipe( + "attribute_ruler", + config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}}, + ) + doc = nlp("This is a test.") + assert doc[2].lemma_ == "the" + assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert doc[3].lemma_ == "cat" + assert doc[3].morph_ == "Case=Nom|Number=Sing" + + +def test_attributeruler_tag_map(nlp, tag_map): + a = AttributeRuler(nlp.vocab) + a.load_from_tag_map(tag_map) + doc = get_doc( + nlp.vocab, + words=["This", "is", "a", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = a(doc) + + for i in range(len(doc)): + if i == 4: + assert doc[i].pos_ == "PUNCT" + assert doc[i].morph_ == "PunctType=peri" + else: + assert doc[i].pos_ == "" + assert doc[i].morph_ == "" + + +def 
test_attributeruler_morph_rules(nlp, morph_rules): + a = AttributeRuler(nlp.vocab) + a.load_from_morph_rules(morph_rules) + doc = get_doc( + nlp.vocab, + words=["This", "is", "the", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = a(doc) + + for i in range(len(doc)): + if i != 2: + assert doc[i].pos_ == "" + assert doc[i].morph_ == "" + else: + assert doc[2].pos_ == "DET" + assert doc[2].lemma_ == "a" + assert doc[2].morph_ == "Case=Nom" + + +def test_attributeruler_indices(nlp): + a = nlp.add_pipe("attribute_ruler") + a.add( + [[{"ORTH": "a"}, {"ORTH": "test"}]], + {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, + index=0, + ) + a.add( + [[{"ORTH": "This"}, {"ORTH": "is"}]], + {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, + index=1, + ) + a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1) + + text = "This is a test." + doc = nlp(text) + + for i in range(len(doc)): + if i == 1: + assert doc[i].lemma_ == "was" + assert doc[i].morph_ == "Case=Nom|Number=Sing" + elif i == 2: + assert doc[i].lemma_ == "the" + assert doc[i].morph_ == "Case=Nom|Number=Plur" + elif i == 3: + assert doc[i].lemma_ == "cat" + else: + assert doc[i].morph_ == "" + + # raises an error when trying to modify a token outside of the match + a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) + with pytest.raises(ValueError): + doc = nlp(text) + + # raises an error when trying to modify a token outside of the match + a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10) + with pytest.raises(ValueError): + doc = nlp(text) + + +def test_attributeruler_patterns_prop(nlp, pattern_dicts): + a = nlp.add_pipe("attribute_ruler") + a.add_patterns(pattern_dicts) + + for p1, p2 in zip(pattern_dicts, a.patterns): + assert p1["patterns"] == p2["patterns"] + assert p1["attrs"] == p2["attrs"] + if p1.get("index"): + assert p1["index"] == p2["index"] + + +def test_attributeruler_serialize(nlp, pattern_dicts): + a = 
nlp.add_pipe("attribute_ruler") + a.add_patterns(pattern_dicts) + + text = "This is a test." + attrs = ["ORTH", "LEMMA", "MORPH"] + doc = nlp(text) + + # bytes roundtrip + a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes()) + assert a.to_bytes() == a_reloaded.to_bytes() + doc1 = a_reloaded(nlp.make_doc(text)) + numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs)) + + # disk roundtrip + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(text) + assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes() + assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs)) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4002eafe3..bb93cf118 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -21,7 +21,8 @@ def assert_almost_equal(a, b): def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + mykb = KnowledgeBase(entity_vector_length=3) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3]) @@ -50,7 +51,8 @@ def test_kb_valid_entities(nlp): def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -66,7 +68,8 @@ def test_kb_invalid_entities(nlp): def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities 
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -80,7 +83,8 @@ def test_kb_invalid_probabilities(nlp): def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -96,7 +100,8 @@ def test_kb_invalid_combination(nlp): def test_kb_invalid_entity_vector(nlp): """Test the invalid construction of a KB with non-matching entity vector lengths""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + mykb = KnowledgeBase(entity_vector_length=3) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3]) @@ -106,9 +111,47 @@ def test_kb_invalid_entity_vector(nlp): mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) +def test_kb_default(nlp): + """Test that the default (empty) KB is loaded when not providing a config""" + entity_linker = nlp.add_pipe("entity_linker", config={}) + assert len(entity_linker.kb) == 0 + assert entity_linker.kb.get_size_entities() == 0 + assert entity_linker.kb.get_size_aliases() == 0 + # default value from pipeline.entity_linker + assert entity_linker.kb.entity_vector_length == 64 + + +def test_kb_custom_length(nlp): + """Test that the default (empty) KB can be configured with a custom entity length""" + entity_linker = nlp.add_pipe( + "entity_linker", config={"kb": {"entity_vector_length": 35}} + ) + assert len(entity_linker.kb) == 0 + assert entity_linker.kb.get_size_entities() == 0 + assert entity_linker.kb.get_size_aliases() == 0 + assert entity_linker.kb.entity_vector_length == 35 + + +def test_kb_undefined(nlp): + """Test that the EL can't train without defining a KB""" + entity_linker = nlp.add_pipe("entity_linker", config={}) + with pytest.raises(ValueError): + entity_linker.begin_training() + 
+ +def test_kb_empty(nlp): + """Test that the EL can't train with an empty KB""" + config = {"kb": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}} + entity_linker = nlp.add_pipe("entity_linker", config=config) + assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError): + entity_linker.begin_training() + + def test_candidate_generation(nlp): """Test correct candidate generation""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -133,7 +176,8 @@ def test_candidate_generation(nlp): def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -163,7 +207,8 @@ def test_append_alias(nlp): def test_append_invalid_alias(nlp): """Test that append an alias will throw an error if prior probs are exceeding 1""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -184,7 +229,8 @@ def test_preserving_links_asdoc(nlp): @registry.assets.register("myLocationsKB.v1") def dummy_kb() -> KnowledgeBase: - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(entity_vector_length=1) + mykb.initialize(nlp.vocab) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) @@ -289,7 +335,8 @@ def test_overfitting_IO(): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher - mykb = KnowledgeBase(nlp.vocab, 
entity_vector_length=3) + mykb = KnowledgeBase(entity_vector_length=3) + mykb.initialize(nlp.vocab) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 64c6c2d6f..9948f6bcd 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -8,6 +8,8 @@ from thinc.api import Model, Linear from thinc.config import ConfigValidationError from pydantic import StrictInt, StrictStr +from ..util import make_tempdir + def test_pipe_function_component(): name = "test_component" @@ -374,3 +376,65 @@ def test_language_factories_scores(): cfg = nlp.config["training"] expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights + + +def test_pipe_factories_from_source(): + """Test adding components from a source model.""" + source_nlp = English() + source_nlp.add_pipe("tagger", name="my_tagger") + nlp = English() + with pytest.raises(ValueError): + nlp.add_pipe("my_tagger", source="en_core_web_sm") + nlp.add_pipe("my_tagger", source=source_nlp) + assert "my_tagger" in nlp.pipe_names + with pytest.raises(KeyError): + nlp.add_pipe("custom", source=source_nlp) + + +def test_pipe_factories_from_source_custom(): + """Test adding components from a source model with custom components.""" + name = "test_pipe_factories_from_source_custom" + + @Language.factory(name, default_config={"arg": "hello"}) + def test_factory(nlp, name, arg: str): + return lambda doc: doc + + source_nlp = English() + source_nlp.add_pipe("tagger") + source_nlp.add_pipe(name, config={"arg": "world"}) + nlp = English() + nlp.add_pipe(name, source=source_nlp) + assert name in nlp.pipe_names + assert nlp.get_pipe_meta(name).default_config["arg"] == "hello" + config = nlp.config["components"][name] + 
assert config["factory"] == name + assert config["arg"] == "world" + + +def test_pipe_factories_from_source_config(): + name = "test_pipe_factories_from_source_config" + + @Language.factory(name, default_config={"arg": "hello"}) + def test_factory(nlp, name, arg: str): + return lambda doc: doc + + source_nlp = English() + source_nlp.add_pipe("tagger") + source_nlp.add_pipe(name, name="yolo", config={"arg": "world"}) + dest_nlp_cfg = {"lang": "en", "pipeline": ["parser", "custom"]} + with make_tempdir() as tempdir: + source_nlp.to_disk(tempdir) + dest_components_cfg = { + "parser": {"factory": "parser"}, + "custom": {"source": str(tempdir), "component": "yolo"}, + } + dest_config = {"nlp": dest_nlp_cfg, "components": dest_components_cfg} + nlp = English.from_config(dest_config) + assert nlp.pipe_names == ["parser", "custom"] + assert nlp.pipe_factories == {"parser": "parser", "custom": name} + meta = nlp.get_pipe_meta("custom") + assert meta.factory == name + assert meta.default_config["arg"] == "hello" + config = nlp.config["components"]["custom"] + assert config["factory"] == name + assert config["arg"] == "world" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index e37375bf1..0141708b4 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -70,6 +70,14 @@ def test_replace_pipe(nlp, name, replacement, invalid_replacement): assert nlp.get_pipe(name) == nlp.create_pipe(replacement) +def test_replace_last_pipe(nlp): + nlp.add_pipe("sentencizer") + nlp.add_pipe("ner") + assert nlp.pipe_names == ["sentencizer", "ner"] + nlp.replace_pipe("ner", "ner") + assert nlp.pipe_names == ["sentencizer", "ner"] + + @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) def test_rename_pipe(nlp, old_name, new_name): with pytest.raises(ValueError): diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py index 
024d7bd26..b012a2cd6 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -1,418 +1,45 @@ -import pytest -from collections import namedtuple -from thinc.api import NumpyOps -from spacy.ml._biluo import BILUO, _get_transition_table +from spacy.lang.en import English +from spacy.gold import Example +from spacy import util +from ..util import make_tempdir -@pytest.fixture( - params=[ - ["PER", "ORG", "LOC", "MISC"], - ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], - ] -) -def labels(request): - return request.param +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), +] -@pytest.fixture -def ops(): - return NumpyOps() +def test_overfitting_IO(): + # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly + nlp = English() + ner = nlp.add_pipe("simple_ner") + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + optimizer = nlp.begin_training() + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.0001 -def _get_actions(labels): - action_names = ( - [f"B{label}" for label in labels] - + [f"I{label}" for label in labels] - + [f"L{label}" for label in labels] - + [f"U{label}" for label in labels] - + ["O"] - ) - A = namedtuple("actions", action_names) - return A(**{name: i for i, name in enumerate(action_names)}) + # test the trained model + test_text = "I like London." 
+ doc = nlp(test_text) + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" - -def test_init_biluo_layer(labels): - model = BILUO() - model.set_dim("nO", model.attrs["get_num_actions"](len(labels))) - model.initialize() - assert model.get_dim("nO") == len(labels) * 4 + 1 - - -def test_transition_table(ops): - labels = ["per", "loc", "org"] - table = _get_transition_table(len(labels)) - a = _get_actions(labels) - assert table.shape == (2, len(a), len(a)) - # Not last token, prev action was B - assert table[0, a.Bper, a.Bper] == 0 - assert table[0, a.Bper, a.Bloc] == 0 - assert table[0, a.Bper, a.Borg] == 0 - assert table[0, a.Bper, a.Iper] == 1 - assert table[0, a.Bper, a.Iloc] == 0 - assert table[0, a.Bper, a.Iorg] == 0 - assert table[0, a.Bper, a.Lper] == 1 - assert table[0, a.Bper, a.Lloc] == 0 - assert table[0, a.Bper, a.Lorg] == 0 - assert table[0, a.Bper, a.Uper] == 0 - assert table[0, a.Bper, a.Uloc] == 0 - assert table[0, a.Bper, a.Uorg] == 0 - assert table[0, a.Bper, a.O] == 0 - - assert table[0, a.Bloc, a.Bper] == 0 - assert table[0, a.Bloc, a.Bloc] == 0 - assert table[0, a.Bloc, a.Borg] == 0 - assert table[0, a.Bloc, a.Iper] == 0 - assert table[0, a.Bloc, a.Iloc] == 1 - assert table[0, a.Bloc, a.Iorg] == 0 - assert table[0, a.Bloc, a.Lper] == 0 - assert table[0, a.Bloc, a.Lloc] == 1 - assert table[0, a.Bloc, a.Lorg] == 0 - assert table[0, a.Bloc, a.Uper] == 0 - assert table[0, a.Bloc, a.Uloc] == 0 - assert table[0, a.Bloc, a.Uorg] == 0 - assert table[0, a.Bloc, a.O] == 0 - - assert table[0, a.Borg, a.Bper] == 0 - assert table[0, a.Borg, a.Bloc] == 0 - assert table[0, a.Borg, a.Borg] == 0 - assert table[0, a.Borg, a.Iper] == 0 - assert table[0, a.Borg, a.Iloc] == 0 - assert table[0, a.Borg, a.Iorg] == 1 - assert table[0, a.Borg, a.Lper] == 0 - assert table[0, a.Borg, a.Lloc] == 0 - assert table[0, a.Borg, a.Lorg] == 1 - assert table[0, a.Borg, a.Uper] == 0 - assert table[0, a.Borg, a.Uloc] == 0 - 
assert table[0, a.Borg, a.Uorg] == 0 - assert table[0, a.Borg, a.O] == 0 - - # Not last token, prev action was I - assert table[0, a.Iper, a.Bper] == 0 - assert table[0, a.Iper, a.Bloc] == 0 - assert table[0, a.Iper, a.Borg] == 0 - assert table[0, a.Iper, a.Iper] == 1 - assert table[0, a.Iper, a.Iloc] == 0 - assert table[0, a.Iper, a.Iorg] == 0 - assert table[0, a.Iper, a.Lper] == 1 - assert table[0, a.Iper, a.Lloc] == 0 - assert table[0, a.Iper, a.Lorg] == 0 - assert table[0, a.Iper, a.Uper] == 0 - assert table[0, a.Iper, a.Uloc] == 0 - assert table[0, a.Iper, a.Uorg] == 0 - assert table[0, a.Iper, a.O] == 0 - - assert table[0, a.Iloc, a.Bper] == 0 - assert table[0, a.Iloc, a.Bloc] == 0 - assert table[0, a.Iloc, a.Borg] == 0 - assert table[0, a.Iloc, a.Iper] == 0 - assert table[0, a.Iloc, a.Iloc] == 1 - assert table[0, a.Iloc, a.Iorg] == 0 - assert table[0, a.Iloc, a.Lper] == 0 - assert table[0, a.Iloc, a.Lloc] == 1 - assert table[0, a.Iloc, a.Lorg] == 0 - assert table[0, a.Iloc, a.Uper] == 0 - assert table[0, a.Iloc, a.Uloc] == 0 - assert table[0, a.Iloc, a.Uorg] == 0 - assert table[0, a.Iloc, a.O] == 0 - - assert table[0, a.Iorg, a.Bper] == 0 - assert table[0, a.Iorg, a.Bloc] == 0 - assert table[0, a.Iorg, a.Borg] == 0 - assert table[0, a.Iorg, a.Iper] == 0 - assert table[0, a.Iorg, a.Iloc] == 0 - assert table[0, a.Iorg, a.Iorg] == 1 - assert table[0, a.Iorg, a.Lper] == 0 - assert table[0, a.Iorg, a.Lloc] == 0 - assert table[0, a.Iorg, a.Lorg] == 1 - assert table[0, a.Iorg, a.Uper] == 0 - assert table[0, a.Iorg, a.Uloc] == 0 - assert table[0, a.Iorg, a.Uorg] == 0 - assert table[0, a.Iorg, a.O] == 0 - - # Not last token, prev action was L - assert table[0, a.Lper, a.Bper] == 1 - assert table[0, a.Lper, a.Bloc] == 1 - assert table[0, a.Lper, a.Borg] == 1 - assert table[0, a.Lper, a.Iper] == 0 - assert table[0, a.Lper, a.Iloc] == 0 - assert table[0, a.Lper, a.Iorg] == 0 - assert table[0, a.Lper, a.Lper] == 0 - assert table[0, a.Lper, a.Lloc] == 0 - assert table[0, 
a.Lper, a.Lorg] == 0 - assert table[0, a.Lper, a.Uper] == 1 - assert table[0, a.Lper, a.Uloc] == 1 - assert table[0, a.Lper, a.Uorg] == 1 - assert table[0, a.Lper, a.O] == 1 - - assert table[0, a.Lloc, a.Bper] == 1 - assert table[0, a.Lloc, a.Bloc] == 1 - assert table[0, a.Lloc, a.Borg] == 1 - assert table[0, a.Lloc, a.Iper] == 0 - assert table[0, a.Lloc, a.Iloc] == 0 - assert table[0, a.Lloc, a.Iorg] == 0 - assert table[0, a.Lloc, a.Lper] == 0 - assert table[0, a.Lloc, a.Lloc] == 0 - assert table[0, a.Lloc, a.Lorg] == 0 - assert table[0, a.Lloc, a.Uper] == 1 - assert table[0, a.Lloc, a.Uloc] == 1 - assert table[0, a.Lloc, a.Uorg] == 1 - assert table[0, a.Lloc, a.O] == 1 - - assert table[0, a.Lorg, a.Bper] == 1 - assert table[0, a.Lorg, a.Bloc] == 1 - assert table[0, a.Lorg, a.Borg] == 1 - assert table[0, a.Lorg, a.Iper] == 0 - assert table[0, a.Lorg, a.Iloc] == 0 - assert table[0, a.Lorg, a.Iorg] == 0 - assert table[0, a.Lorg, a.Lper] == 0 - assert table[0, a.Lorg, a.Lloc] == 0 - assert table[0, a.Lorg, a.Lorg] == 0 - assert table[0, a.Lorg, a.Uper] == 1 - assert table[0, a.Lorg, a.Uloc] == 1 - assert table[0, a.Lorg, a.Uorg] == 1 - assert table[0, a.Lorg, a.O] == 1 - - # Not last token, prev action was U - assert table[0, a.Uper, a.Bper] == 1 - assert table[0, a.Uper, a.Bloc] == 1 - assert table[0, a.Uper, a.Borg] == 1 - assert table[0, a.Uper, a.Iper] == 0 - assert table[0, a.Uper, a.Iloc] == 0 - assert table[0, a.Uper, a.Iorg] == 0 - assert table[0, a.Uper, a.Lper] == 0 - assert table[0, a.Uper, a.Lloc] == 0 - assert table[0, a.Uper, a.Lorg] == 0 - assert table[0, a.Uper, a.Uper] == 1 - assert table[0, a.Uper, a.Uloc] == 1 - assert table[0, a.Uper, a.Uorg] == 1 - assert table[0, a.Uper, a.O] == 1 - - assert table[0, a.Uloc, a.Bper] == 1 - assert table[0, a.Uloc, a.Bloc] == 1 - assert table[0, a.Uloc, a.Borg] == 1 - assert table[0, a.Uloc, a.Iper] == 0 - assert table[0, a.Uloc, a.Iloc] == 0 - assert table[0, a.Uloc, a.Iorg] == 0 - assert table[0, a.Uloc, a.Lper] 
== 0 - assert table[0, a.Uloc, a.Lloc] == 0 - assert table[0, a.Uloc, a.Lorg] == 0 - assert table[0, a.Uloc, a.Uper] == 1 - assert table[0, a.Uloc, a.Uloc] == 1 - assert table[0, a.Uloc, a.Uorg] == 1 - assert table[0, a.Uloc, a.O] == 1 - - assert table[0, a.Uorg, a.Bper] == 1 - assert table[0, a.Uorg, a.Bloc] == 1 - assert table[0, a.Uorg, a.Borg] == 1 - assert table[0, a.Uorg, a.Iper] == 0 - assert table[0, a.Uorg, a.Iloc] == 0 - assert table[0, a.Uorg, a.Iorg] == 0 - assert table[0, a.Uorg, a.Lper] == 0 - assert table[0, a.Uorg, a.Lloc] == 0 - assert table[0, a.Uorg, a.Lorg] == 0 - assert table[0, a.Uorg, a.Uper] == 1 - assert table[0, a.Uorg, a.Uloc] == 1 - assert table[0, a.Uorg, a.Uorg] == 1 - assert table[0, a.Uorg, a.O] == 1 - - # Not last token, prev action was O - assert table[0, a.O, a.Bper] == 1 - assert table[0, a.O, a.Bloc] == 1 - assert table[0, a.O, a.Borg] == 1 - assert table[0, a.O, a.Iper] == 0 - assert table[0, a.O, a.Iloc] == 0 - assert table[0, a.O, a.Iorg] == 0 - assert table[0, a.O, a.Lper] == 0 - assert table[0, a.O, a.Lloc] == 0 - assert table[0, a.O, a.Lorg] == 0 - assert table[0, a.O, a.Uper] == 1 - assert table[0, a.O, a.Uloc] == 1 - assert table[0, a.O, a.Uorg] == 1 - assert table[0, a.O, a.O] == 1 - - # Last token, prev action was B - assert table[1, a.Bper, a.Bper] == 0 - assert table[1, a.Bper, a.Bloc] == 0 - assert table[1, a.Bper, a.Borg] == 0 - assert table[1, a.Bper, a.Iper] == 0 - assert table[1, a.Bper, a.Iloc] == 0 - assert table[1, a.Bper, a.Iorg] == 0 - assert table[1, a.Bper, a.Lper] == 1 - assert table[1, a.Bper, a.Lloc] == 0 - assert table[1, a.Bper, a.Lorg] == 0 - assert table[1, a.Bper, a.Uper] == 0 - assert table[1, a.Bper, a.Uloc] == 0 - assert table[1, a.Bper, a.Uorg] == 0 - assert table[1, a.Bper, a.O] == 0 - - assert table[1, a.Bloc, a.Bper] == 0 - assert table[1, a.Bloc, a.Bloc] == 0 - assert table[0, a.Bloc, a.Borg] == 0 - assert table[1, a.Bloc, a.Iper] == 0 - assert table[1, a.Bloc, a.Iloc] == 0 - assert 
table[1, a.Bloc, a.Iorg] == 0 - assert table[1, a.Bloc, a.Lper] == 0 - assert table[1, a.Bloc, a.Lloc] == 1 - assert table[1, a.Bloc, a.Lorg] == 0 - assert table[1, a.Bloc, a.Uper] == 0 - assert table[1, a.Bloc, a.Uloc] == 0 - assert table[1, a.Bloc, a.Uorg] == 0 - assert table[1, a.Bloc, a.O] == 0 - - assert table[1, a.Borg, a.Bper] == 0 - assert table[1, a.Borg, a.Bloc] == 0 - assert table[1, a.Borg, a.Borg] == 0 - assert table[1, a.Borg, a.Iper] == 0 - assert table[1, a.Borg, a.Iloc] == 0 - assert table[1, a.Borg, a.Iorg] == 0 - assert table[1, a.Borg, a.Lper] == 0 - assert table[1, a.Borg, a.Lloc] == 0 - assert table[1, a.Borg, a.Lorg] == 1 - assert table[1, a.Borg, a.Uper] == 0 - assert table[1, a.Borg, a.Uloc] == 0 - assert table[1, a.Borg, a.Uorg] == 0 - assert table[1, a.Borg, a.O] == 0 - - # Last token, prev action was I - assert table[1, a.Iper, a.Bper] == 0 - assert table[1, a.Iper, a.Bloc] == 0 - assert table[1, a.Iper, a.Borg] == 0 - assert table[1, a.Iper, a.Iper] == 0 - assert table[1, a.Iper, a.Iloc] == 0 - assert table[1, a.Iper, a.Iorg] == 0 - assert table[1, a.Iper, a.Lper] == 1 - assert table[1, a.Iper, a.Lloc] == 0 - assert table[1, a.Iper, a.Lorg] == 0 - assert table[1, a.Iper, a.Uper] == 0 - assert table[1, a.Iper, a.Uloc] == 0 - assert table[1, a.Iper, a.Uorg] == 0 - assert table[1, a.Iper, a.O] == 0 - - assert table[1, a.Iloc, a.Bper] == 0 - assert table[1, a.Iloc, a.Bloc] == 0 - assert table[1, a.Iloc, a.Borg] == 0 - assert table[1, a.Iloc, a.Iper] == 0 - assert table[1, a.Iloc, a.Iloc] == 0 - assert table[1, a.Iloc, a.Iorg] == 0 - assert table[1, a.Iloc, a.Lper] == 0 - assert table[1, a.Iloc, a.Lloc] == 1 - assert table[1, a.Iloc, a.Lorg] == 0 - assert table[1, a.Iloc, a.Uper] == 0 - assert table[1, a.Iloc, a.Uloc] == 0 - assert table[1, a.Iloc, a.Uorg] == 0 - assert table[1, a.Iloc, a.O] == 0 - - assert table[1, a.Iorg, a.Bper] == 0 - assert table[1, a.Iorg, a.Bloc] == 0 - assert table[1, a.Iorg, a.Borg] == 0 - assert table[1, a.Iorg, 
a.Iper] == 0 - assert table[1, a.Iorg, a.Iloc] == 0 - assert table[1, a.Iorg, a.Iorg] == 0 - assert table[1, a.Iorg, a.Lper] == 0 - assert table[1, a.Iorg, a.Lloc] == 0 - assert table[1, a.Iorg, a.Lorg] == 1 - assert table[1, a.Iorg, a.Uper] == 0 - assert table[1, a.Iorg, a.Uloc] == 0 - assert table[1, a.Iorg, a.Uorg] == 0 - assert table[1, a.Iorg, a.O] == 0 - - # Last token, prev action was L - assert table[1, a.Lper, a.Bper] == 0 - assert table[1, a.Lper, a.Bloc] == 0 - assert table[1, a.Lper, a.Borg] == 0 - assert table[1, a.Lper, a.Iper] == 0 - assert table[1, a.Lper, a.Iloc] == 0 - assert table[1, a.Lper, a.Iorg] == 0 - assert table[1, a.Lper, a.Lper] == 0 - assert table[1, a.Lper, a.Lloc] == 0 - assert table[1, a.Lper, a.Lorg] == 0 - assert table[1, a.Lper, a.Uper] == 1 - assert table[1, a.Lper, a.Uloc] == 1 - assert table[1, a.Lper, a.Uorg] == 1 - assert table[1, a.Lper, a.O] == 1 - - assert table[1, a.Lloc, a.Bper] == 0 - assert table[1, a.Lloc, a.Bloc] == 0 - assert table[1, a.Lloc, a.Borg] == 0 - assert table[1, a.Lloc, a.Iper] == 0 - assert table[1, a.Lloc, a.Iloc] == 0 - assert table[1, a.Lloc, a.Iorg] == 0 - assert table[1, a.Lloc, a.Lper] == 0 - assert table[1, a.Lloc, a.Lloc] == 0 - assert table[1, a.Lloc, a.Lorg] == 0 - assert table[1, a.Lloc, a.Uper] == 1 - assert table[1, a.Lloc, a.Uloc] == 1 - assert table[1, a.Lloc, a.Uorg] == 1 - assert table[1, a.Lloc, a.O] == 1 - - assert table[1, a.Lorg, a.Bper] == 0 - assert table[1, a.Lorg, a.Bloc] == 0 - assert table[1, a.Lorg, a.Borg] == 0 - assert table[1, a.Lorg, a.Iper] == 0 - assert table[1, a.Lorg, a.Iloc] == 0 - assert table[1, a.Lorg, a.Iorg] == 0 - assert table[1, a.Lorg, a.Lper] == 0 - assert table[1, a.Lorg, a.Lloc] == 0 - assert table[1, a.Lorg, a.Lorg] == 0 - assert table[1, a.Lorg, a.Uper] == 1 - assert table[1, a.Lorg, a.Uloc] == 1 - assert table[1, a.Lorg, a.Uorg] == 1 - assert table[1, a.Lorg, a.O] == 1 - - # Last token, prev action was U - assert table[1, a.Uper, a.Bper] == 0 - assert 
table[1, a.Uper, a.Bloc] == 0 - assert table[1, a.Uper, a.Borg] == 0 - assert table[1, a.Uper, a.Iper] == 0 - assert table[1, a.Uper, a.Iloc] == 0 - assert table[1, a.Uper, a.Iorg] == 0 - assert table[1, a.Uper, a.Lper] == 0 - assert table[1, a.Uper, a.Lloc] == 0 - assert table[1, a.Uper, a.Lorg] == 0 - assert table[1, a.Uper, a.Uper] == 1 - assert table[1, a.Uper, a.Uloc] == 1 - assert table[1, a.Uper, a.Uorg] == 1 - assert table[1, a.Uper, a.O] == 1 - - assert table[1, a.Uloc, a.Bper] == 0 - assert table[1, a.Uloc, a.Bloc] == 0 - assert table[1, a.Uloc, a.Borg] == 0 - assert table[1, a.Uloc, a.Iper] == 0 - assert table[1, a.Uloc, a.Iloc] == 0 - assert table[1, a.Uloc, a.Iorg] == 0 - assert table[1, a.Uloc, a.Lper] == 0 - assert table[1, a.Uloc, a.Lloc] == 0 - assert table[1, a.Uloc, a.Lorg] == 0 - assert table[1, a.Uloc, a.Uper] == 1 - assert table[1, a.Uloc, a.Uloc] == 1 - assert table[1, a.Uloc, a.Uorg] == 1 - assert table[1, a.Uloc, a.O] == 1 - - assert table[1, a.Uorg, a.Bper] == 0 - assert table[1, a.Uorg, a.Bloc] == 0 - assert table[1, a.Uorg, a.Borg] == 0 - assert table[1, a.Uorg, a.Iper] == 0 - assert table[1, a.Uorg, a.Iloc] == 0 - assert table[1, a.Uorg, a.Iorg] == 0 - assert table[1, a.Uorg, a.Lper] == 0 - assert table[1, a.Uorg, a.Lloc] == 0 - assert table[1, a.Uorg, a.Lorg] == 0 - assert table[1, a.Uorg, a.Uper] == 1 - assert table[1, a.Uorg, a.Uloc] == 1 - assert table[1, a.Uorg, a.Uorg] == 1 - assert table[1, a.Uorg, a.O] == 1 - - # Last token, prev action was O - assert table[1, a.O, a.Bper] == 0 - assert table[1, a.O, a.Bloc] == 0 - assert table[1, a.O, a.Borg] == 0 - assert table[1, a.O, a.Iper] == 0 - assert table[1, a.O, a.Iloc] == 0 - assert table[1, a.O, a.Iorg] == 0 - assert table[1, a.O, a.Lper] == 0 - assert table[1, a.O, a.Lloc] == 0 - assert table[1, a.O, a.Lorg] == 0 - assert table[1, a.O, a.Uper] == 1 - assert table[1, a.O, a.Uloc] == 1 - assert table[1, a.O, a.Uorg] == 1 - assert table[1, a.O, a.O] == 1 + # Also test the results are 
still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "London" + assert ents2[0].label_ == "LOC" diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index d5a549f13..41384897a 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -117,9 +117,7 @@ def test_overfitting_IO(): assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) # Test scoring - scores = nlp.evaluate( - train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}} - ) + scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"}) assert scores["cats_f"] == 1.0 assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 636cddcb7..27464a39a 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -438,9 +438,8 @@ def test_issue4402(): data = DocBin(docs=docs, attrs=attrs).to_bytes() with output_file.open("wb") as file_: file_.write(data) - corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - - train_data = list(corpus.train_dataset(nlp)) + reader = Corpus(output_file) + train_data = list(reader(nlp)) assert len(train_data) == 2 split_train_data = [] diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 08a21e690..0b3b4a9fc 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -139,7 +139,8 @@ def test_issue4665(): def test_issue4674(): """Test that setting entities with overlapping identifiers does not mess up IO""" nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb = 
KnowledgeBase(entity_vector_length=3) + kb.initialize(nlp.vocab) vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] with pytest.warns(UserWarning): @@ -156,7 +157,8 @@ def test_issue4674(): dir_path.mkdir() file_path = dir_path / "kb" kb.dump(str(file_path)) - kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2 = KnowledgeBase(entity_vector_length=3) + kb2.initialize(nlp.vocab) kb2.load_bulk(str(file_path)) assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py index 095ca8495..cc7a9bd38 100644 --- a/spacy/tests/regression/test_issue5137.py +++ b/spacy/tests/regression/test_issue5137.py @@ -27,6 +27,6 @@ def test_issue5137(): with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - overrides = {"my_component": {"categories": "my_categories"}} - nlp2 = spacy.load(tmpdir, component_cfg=overrides) + overrides = {"components": {"my_component": {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) assert nlp2.get_pipe("my_component").categories == "my_categories" diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index ae9ed1844..31292b700 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -72,7 +72,8 @@ def entity_linker(): @registry.assets.register("TestIssue5230KB.v1") def dummy_kb() -> KnowledgeBase: - kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb = KnowledgeBase(entity_vector_length=1) + kb.initialize(nlp.vocab) kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) return kb @@ -121,7 +122,8 @@ def test_writer_with_path_py35(): def test_save_and_load_knowledge_base(): nlp = Language() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb = KnowledgeBase(entity_vector_length=1) + kb.initialize(nlp.vocab) with make_tempdir() as d: path = d / "kb" try: @@ -130,7 +132,8 @@ def test_save_and_load_knowledge_base(): pytest.fail(str(e)) 
try: - kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb_loaded = KnowledgeBase(entity_vector_length=1) + kb_loaded.initialize(nlp.vocab) kb_loaded.load_bulk(path) except Exception as e: pytest.fail(str(e)) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ce35add42..0d3c90c92 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -2,6 +2,7 @@ import pytest from thinc.config import Config, ConfigValidationError import spacy from spacy.lang.en import English +from spacy.lang.de import German from spacy.language import Language from spacy.util import registry, deep_merge_configs, load_model_from_config from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model @@ -11,8 +12,23 @@ from ..util import make_tempdir nlp_config_string = """ +[paths] +train = "" +dev = "" + [training] -batch_size = 666 + +[training.train_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + +[training.dev_corpus] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} + +[training.batcher] +@batchers = "batch_by_words.v1" +size = 666 [nlp] lang = "en" @@ -73,14 +89,9 @@ def my_parser(): width=321, rows=5432, also_embed_subwords=True, - also_use_static_vectors=False + also_use_static_vectors=False, ), - MaxoutWindowEncoder( - width=321, - window_size=3, - maxout_pieces=4, - depth=2 - ) + MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 @@ -93,7 +104,7 @@ def test_create_nlp_from_config(): with pytest.raises(ConfigValidationError): nlp, _ = load_model_from_config(config, auto_fill=False) nlp, resolved = load_model_from_config(config, auto_fill=True) - assert nlp.config["training"]["batch_size"] == 666 + assert nlp.config["training"]["batcher"]["size"] == 666 assert len(nlp.config["training"]) > 1 assert 
nlp.pipe_names == ["tok2vec", "tagger"] assert len(nlp.config["components"]) == 2 @@ -272,3 +283,33 @@ def test_serialize_config_missing_pipes(): assert "tok2vec" not in config["components"] with pytest.raises(ValueError): load_model_from_config(config, auto_fill=True) + + +def test_config_overrides(): + overrides_nested = {"nlp": {"lang": "de", "pipeline": ["tagger"]}} + overrides_dot = {"nlp.lang": "de", "nlp.pipeline": ["tagger"]} + # load_model from config with overrides passed directly to Config + config = Config().from_str(nlp_config_string, overrides=overrides_dot) + nlp, _ = load_model_from_config(config, auto_fill=True) + assert isinstance(nlp, German) + assert nlp.pipe_names == ["tagger"] + # Serialized roundtrip with config passed in + base_config = Config().from_str(nlp_config_string) + base_nlp, _ = load_model_from_config(base_config, auto_fill=True) + assert isinstance(base_nlp, English) + assert base_nlp.pipe_names == ["tok2vec", "tagger"] + with make_tempdir() as d: + base_nlp.to_disk(d) + nlp = spacy.load(d, config=overrides_nested) + assert isinstance(nlp, German) + assert nlp.pipe_names == ["tagger"] + with make_tempdir() as d: + base_nlp.to_disk(d) + nlp = spacy.load(d, config=overrides_dot) + assert isinstance(nlp, German) + assert nlp.pipe_names == ["tagger"] + with make_tempdir() as d: + base_nlp.to_disk(d) + nlp = spacy.load(d) + assert isinstance(nlp, English) + assert nlp.pipe_names == ["tok2vec", "tagger"] diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index a547b51bc..4a976fc02 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,5 +1,4 @@ import spacy -import pytest from spacy.lang.en import English from spacy.tokens import Doc, DocBin diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 91036a496..3f33c6f06 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ 
b/spacy/tests/serialize/test_serialize_kb.py @@ -17,7 +17,8 @@ def test_serialize_kb_disk(en_vocab): file_path = dir_path / "kb" kb1.dump(str(file_path)) - kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) + kb2 = KnowledgeBase(entity_vector_length=3) + kb2.initialize(en_vocab) kb2.load_bulk(str(file_path)) # final assertions @@ -25,7 +26,8 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): - kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) + kb = KnowledgeBase(entity_vector_length=3) + kb.initialize(vocab) kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3]) kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0]) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index c44daf630..16974a4c2 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,5 +1,5 @@ import numpy -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags +from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example @@ -483,14 +483,14 @@ def test_roundtrip_docs_to_docbin(doc): reloaded_nlp = English() json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = Corpus(str(json_file), str(json_file)) output_file = tmpdir / "roundtrip.spacy" data = DocBin(docs=[doc]).to_bytes() with output_file.open("wb") as file_: file_.write(data) - goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp)) - assert len(doc) == goldcorpus.count_train(reloaded_nlp) + reader = Corpus(output_file) + reloaded_examples = list(reader(reloaded_nlp)) + assert len(doc) == sum(len(eg) for eg in reloaded_examples) + reloaded_example = reloaded_examples[0] assert text == reloaded_example.reference.text assert idx == [t.idx for t in 
reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] @@ -515,10 +515,9 @@ def test_make_orth_variants(doc): data = DocBin(docs=[doc]).to_bytes() with output_file.open("wb") as file_: file_.write(data) - goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - # due to randomness, test only that this runs with no errors for now - train_example = next(goldcorpus.train_dataset(nlp)) + reader = Corpus(output_file) + train_example = next(reader(nlp)) make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @@ -647,11 +646,83 @@ def test_split_sents(merged_dict): assert split_examples[1].text == "It is just me" token_annotation_1 = split_examples[0].to_dict()["token_annotation"] - assert token_annotation_1["words"] == ["Hi", "there", "everyone"] - assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"] - assert token_annotation_1["sent_starts"] == [1, 0, 0] + assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"] + assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"] + assert token_annotation_1["SENT_START"] == [1, 0, 0] token_annotation_2 = split_examples[1].to_dict()["token_annotation"] - assert token_annotation_2["words"] == ["It", "is", "just", "me"] - assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"] - assert token_annotation_2["sent_starts"] == [1, 0, 0, 0] + assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"] + assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"] + assert token_annotation_2["SENT_START"] == [1, 0, 0, 0] + + +def test_alignment(): + other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1] + 
assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7] + + +def test_alignment_case_insensitive(): + other_tokens = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "Obama", "'s", "PODCASTS", "."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [1, 1, 1, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 6] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 1, 1] + assert list(align.y2x.dataXd) == [0, 1, 2, 3, 4, 5, 6, 7] + + +def test_alignment_complex(): + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + align = Alignment.from_strings(other_tokens, spacy_tokens) + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + +def test_alignment_complex_example(en_vocab): + other_tokens = ["i listened to", "obama", "'", "s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts."] + predicted = Doc( + en_vocab, words=other_tokens, spaces=[True, False, False, True, False, False] + ) + reference = Doc( + en_vocab, words=spacy_tokens, spaces=[True, True, True, False, True, False] + ) + assert predicted.text == "i listened to obama's podcasts." + assert reference.text == "i listened to obama's podcasts." 
+ example = Example(predicted, reference) + align = example.alignment + assert list(align.x2y.lengths) == [3, 1, 1, 1, 1, 1] + assert list(align.x2y.dataXd) == [0, 1, 2, 3, 4, 4, 5, 5] + assert list(align.y2x.lengths) == [1, 1, 1, 1, 2, 2] + assert list(align.y2x.dataXd) == [0, 0, 0, 1, 2, 3, 4, 5] + + +def test_alignment_different_texts(): + other_tokens = ["she", "listened", "to", "obama", "'s", "podcasts", "."] + spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] + with pytest.raises(ValueError): + Alignment.from_strings(other_tokens, spacy_tokens) + + +def test_retokenized_docs(doc): + a = doc.to_array(["TAG"]) + doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) + doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) + example = Example(doc1, doc2) + # fmt: off + expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."] + expected2 = [None, "sister", "flew", "to", None, "via", "London", "."] + # fmt: on + assert example.get_aligned("ORTH", as_string=True) == expected1 + with doc1.retokenize() as retokenizer: + retokenizer.merge(doc1[0:2]) + retokenizer.merge(doc1[5:7]) + assert example.get_aligned("ORTH", as_string=True) == expected2 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index a63a8e24c..6865cd1e5 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -3,10 +3,11 @@ import pytest from spacy.language import Language from spacy.tokens import Doc, Span from spacy.vocab import Vocab +from spacy.gold import Example from spacy.lang.en import English +from spacy.util import registry from .util import add_vecs_to_vocab, assert_docs_equal -from ..gold import Example @pytest.fixture @@ -153,6 +154,85 @@ def test_language_pipe_stream(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -def test_language_from_config(): - English.from_config() - # TODO: add more tests +def 
test_language_from_config_before_after_init(): + name = "test_language_from_config_before_after_init" + ran_before = False + ran_after = False + ran_after_pipeline = False + + @registry.callbacks(f"{name}_before") + def make_before_creation(): + def before_creation(lang_cls): + nonlocal ran_before + ran_before = True + assert lang_cls is English + lang_cls.Defaults.foo = "bar" + return lang_cls + + return before_creation + + @registry.callbacks(f"{name}_after") + def make_after_creation(): + def after_creation(nlp): + nonlocal ran_after + ran_after = True + assert isinstance(nlp, English) + assert nlp.pipe_names == [] + assert nlp.Defaults.foo == "bar" + nlp.meta["foo"] = "bar" + return nlp + + return after_creation + + @registry.callbacks(f"{name}_after_pipeline") + def make_after_pipeline_creation(): + def after_pipeline_creation(nlp): + nonlocal ran_after_pipeline + ran_after_pipeline = True + assert isinstance(nlp, English) + assert nlp.pipe_names == ["sentencizer"] + assert nlp.Defaults.foo == "bar" + assert nlp.meta["foo"] == "bar" + nlp.meta["bar"] = "baz" + return nlp + + return after_pipeline_creation + + config = { + "nlp": { + "pipeline": ["sentencizer"], + "before_creation": {"@callbacks": f"{name}_before"}, + "after_creation": {"@callbacks": f"{name}_after"}, + "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"}, + }, + "components": {"sentencizer": {"factory": "sentencizer"}}, + } + nlp = English.from_config(config) + assert all([ran_before, ran_after, ran_after_pipeline]) + assert nlp.Defaults.foo == "bar" + assert nlp.meta["foo"] == "bar" + assert nlp.meta["bar"] == "baz" + assert nlp.pipe_names == ["sentencizer"] + assert nlp("text") + + +def test_language_from_config_before_after_init_invalid(): + """Check that an error is raised if function doesn't return nlp.""" + name = "test_language_from_config_before_after_init_invalid" + registry.callbacks(f"{name}_before1", func=lambda: lambda nlp: None) + 
registry.callbacks(f"{name}_before2", func=lambda: lambda nlp: nlp()) + registry.callbacks(f"{name}_after1", func=lambda: lambda nlp: None) + registry.callbacks(f"{name}_after2", func=lambda: lambda nlp: English) + + for callback_name in [f"{name}_before1", f"{name}_before2"]: + config = {"nlp": {"before_creation": {"@callbacks": callback_name}}} + with pytest.raises(ValueError): + English.from_config(config) + for callback_name in [f"{name}_after1", f"{name}_after2"]: + config = {"nlp": {"after_creation": {"@callbacks": callback_name}}} + with pytest.raises(ValueError): + English.from_config(config) + for callback_name in [f"{name}_after1", f"{name}_after2"]: + config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}} + with pytest.raises(ValueError): + English.from_config(config) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 4c38ea6c6..8f1bb1c3d 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -24,6 +24,7 @@ def get_textcat_kwargs(): "nO": 7, } + def get_textcat_cnn_kwargs(): return { "tok2vec": test_tok2vec(), @@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs(): "nO": 13, } + def get_all_params(model): params = [] for node in model.walk(): @@ -59,17 +61,11 @@ def get_tok2vec_kwargs(): # This actually creates models, so seems best to put it in a function.
return { "embed": MultiHashEmbed( - width=32, - rows=500, - also_embed_subwords=True, - also_use_static_vectors=False + width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False ), "encode": MaxoutWindowEncoder( - width=32, - depth=2, - maxout_pieces=2, - window_size=1, - ) + width=32, depth=2, maxout_pieces=2, window_size=1, + ), } diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 886a24a8e..df6489aa8 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -42,7 +42,7 @@ def test_Example_from_dict_with_tags(pred_words, annots): example = Example.from_dict(predicted, annots) for i, token in enumerate(example.reference): assert token.tag_ == annots["tags"][i] - aligned_tags = example.get_aligned("tag", as_string=True) + aligned_tags = example.get_aligned("TAG", as_string=True) assert aligned_tags == ["NN" for _ in predicted] @@ -53,9 +53,13 @@ def test_aligned_tags(): annots = {"words": gold_words, "tags": gold_tags} vocab = Vocab() predicted = Doc(vocab, words=pred_words) - example = Example.from_dict(predicted, annots) - aligned_tags = example.get_aligned("tag", as_string=True) - assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"] + example1 = Example.from_dict(predicted, annots) + aligned_tags1 = example1.get_aligned("TAG", as_string=True) + assert aligned_tags1 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"] + # ensure that to_dict works correctly + example2 = Example.from_dict(predicted, example1.to_dict()) + aligned_tags2 = example2.get_aligned("TAG", as_string=True) + assert aligned_tags2 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"] def test_aligned_tags_multi(): @@ -66,7 +70,7 @@ def test_aligned_tags_multi(): vocab = Vocab() predicted = Doc(vocab, words=pred_words) example = Example.from_dict(predicted, annots) - aligned_tags = example.get_aligned("tag", as_string=True) + aligned_tags = example.get_aligned("TAG", 
as_string=True) assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"] diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 76b5e64df..b30705088 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -19,14 +19,9 @@ def test_empty_doc(): width=width, rows=embed_size, also_use_static_vectors=False, - also_embed_subwords=True + also_embed_subwords=True, ), - MaxoutWindowEncoder( - width=width, - depth=4, - window_size=1, - maxout_pieces=3 - ) + MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3), ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update([doc]) @@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): width=width, rows=embed_size, also_use_static_vectors=False, - also_embed_subwords=True + also_embed_subwords=True, ), - MaxoutWindowEncoder( - width=width, - depth=4, - window_size=1, - maxout_pieces=3, - ) + MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,), ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 3a6c0fd95..47111a902 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -3,8 +3,9 @@ import pytest from .util import get_random_doc from spacy import util -from spacy.util import minibatch_by_words, dot_to_object +from spacy.util import dot_to_object from thinc.api import Config, Optimizer +from spacy.gold.batchers import minibatch_by_words from ..lang.en import English from ..lang.nl import Dutch @@ -84,27 +85,24 @@ def test_util_dot_section(): """ nlp_config = Config().from_str(cfg_string) en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True) - default_config = Config().from_disk(DEFAULT_CONFIG_PATH) default_config["nlp"]["lang"] = "nl" nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True) - # Test that creation went OK assert isinstance(en_nlp, English) 
assert isinstance(nl_nlp, Dutch) assert nl_nlp.pipe_names == [] assert en_nlp.pipe_names == ["textcat"] - assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes - + # not exclusive_classes + assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten assert not en_config["nlp"]["load_vocab_data"] assert nl_config["nlp"]["load_vocab_data"] # default value True - # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): - obj = dot_to_object(en_config, "nlp.pipeline.tagger") + dot_to_object(en_config, "nlp.pipeline.tagger") with pytest.raises(KeyError): - obj = dot_to_object(en_config, "nlp.unknownattribute") + dot_to_object(en_config, "nlp.unknownattribute") assert not dot_to_object(en_config, "nlp.load_vocab_data") assert dot_to_object(nl_config, "nlp.load_vocab_data") assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index b89ce3bdd..61f7c3db0 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -12,6 +12,7 @@ from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC from ..attrs cimport TAG, MORPH +from ..vocab cimport Vocab from .underscore import is_writable_attr from ..attrs import intify_attrs @@ -57,16 +58,7 @@ cdef class Retokenizer: raise ValueError(Errors.E102.format(token=repr(token))) self.tokens_to_merge.add(token.i) self._spans_to_merge.append((span.start, span.end)) - if "_" in attrs: # Extension attributes - extensions = attrs["_"] - _validate_extensions(extensions) - attrs = {key: value for key, value in attrs.items() if key != "_"} - attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - attrs["_"] = extensions - else: - attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - if MORPH in attrs: - 
self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH])) + attrs = normalize_token_attrs(self.doc.vocab, attrs) self.merges.append((span, attrs)) def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): @@ -98,9 +90,11 @@ cdef class Retokenizer: # NB: Since we support {"KEY": [value, value]} syntax here, this # will only "intify" the keys, not the values attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) - if MORPH in attrs: - for morph in attrs[MORPH]: - self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph)) + if MORPH in attrs: + for i, morph in enumerate(attrs[MORPH]): + # add and set to normalized value + morph = self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph)) + attrs[MORPH][i] = morph head_offsets = [] for head in heads: if isinstance(head, Token): @@ -224,21 +218,7 @@ def _merge(Doc doc, merges): token.lex = lex # We set trailing space here too token.spacy = doc.c[spans[token_index].end-1].spacy - py_token = span[0] - # Assign attributes - for attr_name, attr_value in attributes.items(): - if attr_name == "_": # Set extension attributes - for ext_attr_key, ext_attr_value in attr_value.items(): - py_token._.set(ext_attr_key, ext_attr_value) - elif attr_name == TAG: - doc.vocab.morphology.assign_tag(token, attr_value) - else: - # Set attributes on both token and lexeme to take care of token - # attribute vs. lexical attribute without having to enumerate - # them. If an attribute name is not valid, set_struct_attr will - # ignore it. 
- Token.set_struct_attr(token, attr_name, attr_value) - Lexeme.set_struct_attr(lex, attr_name, attr_value) + set_token_attrs(span[0], attributes) # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a @@ -423,3 +403,40 @@ cdef make_iob_consistent(TokenC* tokens, int length): for i in range(1, length): if tokens[i].ent_iob == 1 and tokens[i - 1].ent_type != tokens[i].ent_type: tokens[i].ent_iob = 3 + + +def normalize_token_attrs(Vocab vocab, attrs): + if "_" in attrs: # Extension attributes + extensions = attrs["_"] + print("EXTENSIONS", extensions) + _validate_extensions(extensions) + attrs = {key: value for key, value in attrs.items() if key != "_"} + attrs = intify_attrs(attrs, strings_map=vocab.strings) + attrs["_"] = extensions + else: + attrs = intify_attrs(attrs, strings_map=vocab.strings) + if MORPH in attrs: + # add and set to normalized value + morph = vocab.morphology.add(vocab.strings.as_string(attrs[MORPH])) + attrs[MORPH] = morph + return attrs + + +def set_token_attrs(Token py_token, attrs): + cdef TokenC* token = py_token.c + cdef const LexemeC* lex = token.lex + cdef Doc doc = py_token.doc + # Assign attributes + for attr_name, attr_value in attrs.items(): + if attr_name == "_": # Set extension attributes + for ext_attr_key, ext_attr_value in attr_value.items(): + py_token._.set(ext_attr_key, ext_attr_value) + elif attr_name == TAG: + doc.vocab.morphology.assign_tag(token, attr_value) + else: + # Set attributes on both token and lexeme to take care of token + # attribute vs. lexical attribute without having to enumerate + # them. If an attribute name is not valid, set_struct_attr will + # ignore it. 
+ Token.set_struct_attr(token, attr_name, attr_value) + Lexeme.set_struct_attr(lex, attr_name, attr_value) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 5b55d8e88..15e6518d6 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -176,9 +176,13 @@ cdef class Span: return Span(self.doc, start + self.start, end + self.start) else: if i < 0: - return self.doc[self.end + i] + token_i = self.end + i else: - return self.doc[self.start + i] + token_i = self.start + i + if self.start <= token_i < self.end: + return self.doc[token_i] + else: + raise IndexError(Errors.E1002) def __iter__(self): """Iterate over `Token` objects. diff --git a/spacy/util.py b/spacy/util.py index 677f5e8e0..e580d6c62 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,5 +1,5 @@ from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple -from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING +from typing import Iterator, Type, Pattern, TYPE_CHECKING from types import ModuleType import os import importlib @@ -7,7 +7,7 @@ import importlib.util import re from pathlib import Path import thinc -from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model +from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer import functools import itertools import numpy.random @@ -24,8 +24,6 @@ import tempfile import shutil import shlex import inspect -from thinc.types import Unserializable - try: import cupy.random @@ -46,7 +44,7 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401 from .symbols import ORTH from .compat import cupy, CudaStream, is_windows -from .errors import Errors, Warnings +from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS from . 
import about if TYPE_CHECKING: @@ -69,6 +67,10 @@ class registry(thinc.registry): lookups = catalogue.create("spacy", "lookups", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True) + # Callback functions used to manipulate nlp object etc. + callbacks = catalogue.create("spacy", "callbacks") + batchers = catalogue.create("spacy", "batchers", entry_points=True) + readers = catalogue.create("spacy", "readers", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. The "true" factories are added via the @@ -205,45 +207,55 @@ def load_vectors_into_model( def load_model( name: Union[str, Path], + *, + vocab: Union["Vocab", bool] = True, disable: Iterable[str] = tuple(), - component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), + config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a package or data path. name (str): Package name or model path. + vocab (Vocab / True): Optional vocab to pass in on initialization. If True, + a new Vocab object will be created. disable (Iterable[str]): Names of pipeline components to disable. - component_cfg (Dict[str, dict]): Config overrides for pipeline components, - keyed by component names. + config (Dict[str, Any] / Config): Config overrides as nested dict or dict + keyed by section values in dot notation. RETURNS (Language): The loaded nlp object. 
""" - cfg = component_cfg + kwargs = {"vocab": vocab, "disable": disable, "config": config} if isinstance(name, str): # name or string path if name.startswith("blank:"): # shortcut for blank model return get_lang_class(name.replace("blank:", ""))() if is_package(name): # installed as package - return load_model_from_package(name, disable=disable, component_cfg=cfg) + return load_model_from_package(name, **kwargs) if Path(name).exists(): # path to model data directory - return load_model_from_path(Path(name), disable=disable, component_cfg=cfg) + return load_model_from_path(Path(name), **kwargs) elif hasattr(name, "exists"): # Path or Path-like to model data - return load_model_from_path(name, disable=disable, component_cfg=cfg) + return load_model_from_path(name, **kwargs) + if name in OLD_MODEL_SHORTCUTS: + raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) raise IOError(Errors.E050.format(name=name)) def load_model_from_package( name: str, + *, + vocab: Union["Vocab", bool] = True, disable: Iterable[str] = tuple(), - component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), + config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package.""" cls = importlib.import_module(name) - return cls.load(disable=disable, component_cfg=component_cfg) + return cls.load(vocab=vocab, disable=disable, config=config) def load_model_from_path( model_path: Union[str, Path], + *, meta: Optional[Dict[str, Any]] = None, + vocab: Union["Vocab", bool] = True, disable: Iterable[str] = tuple(), - component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), + config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. 
Creates Language class with pipeline from config.cfg and then calls from_disk() with path.""" @@ -254,17 +266,16 @@ def load_model_from_path( config_path = model_path / "config.cfg" if not config_path.exists() or not config_path.is_file(): raise IOError(Errors.E053.format(path=config_path, name="config.cfg")) - config = Config().from_disk(config_path) - override_cfg = {"components": {p: dict_to_dot(c) for p, c in component_cfg.items()}} - overrides = dict_to_dot(override_cfg) - nlp, _ = load_model_from_config(config, disable=disable, overrides=overrides) + config = Config().from_disk(config_path, overrides=dict_to_dot(config)) + nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable) return nlp.from_disk(model_path, exclude=disable) def load_model_from_config( config: Union[Dict[str, Any], Config], + *, + vocab: Union["Vocab", bool] = True, disable: Iterable[str] = tuple(), - overrides: Dict[str, Any] = {}, auto_fill: bool = False, validate: bool = True, ) -> Tuple["Language", Config]: @@ -280,26 +291,20 @@ def load_model_from_config( # registry, including custom subclasses provided via entry points lang_cls = get_lang_class(nlp_config["lang"]) nlp = lang_cls.from_config( - config, - disable=disable, - overrides=overrides, - auto_fill=auto_fill, - validate=validate, + config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate, ) return nlp, nlp.resolved def load_model_from_init_py( init_file: Union[Path, str], + *, + vocab: Union["Vocab", bool] = True, disable: Iterable[str] = tuple(), - component_cfg: Dict[str, Dict[str, Any]] = SimpleFrozenDict(), + config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's __init__.py. - - init_file (str): Path to model's __init__.py, i.e. `__file__`. - **overrides: Specific overrides, like pipeline components to disable. - RETURNS (Language): `Language` class with loaded model. 
""" model_path = Path(init_file).parent meta = get_model_meta(model_path) @@ -308,7 +313,7 @@ def load_model_from_init_py( if not model_path.exists(): raise IOError(Errors.E052.format(path=data_path)) return load_model_from_path( - data_path, meta, disable=disable, component_cfg=component_cfg + data_path, vocab=vocab, meta=meta, disable=disable, config=config ) @@ -749,145 +754,6 @@ def normalize_slice( return start, stop -def minibatch( - items: Iterable[Any], size: Union[Iterator[int], int] = 8 -) -> Iterator[Any]: - """Iterate over batches of items. `size` may be an iterator, - so that batch-size can vary on each step. - """ - if isinstance(size, int): - size_ = itertools.repeat(size) - else: - size_ = size - items = iter(items) - while True: - batch_size = next(size_) - batch = list(itertools.islice(items, int(batch_size))) - if len(batch) == 0: - break - yield list(batch) - - -def minibatch_by_padded_size( - docs: Iterator["Doc"], - size: Union[Iterator[int], int], - buffer: int = 256, - discard_oversize: bool = False, -) -> Iterator[Iterator["Doc"]]: - if isinstance(size, int): - size_ = itertools.repeat(size) - else: - size_ = size - for outer_batch in minibatch(docs, buffer): - outer_batch = list(outer_batch) - target_size = next(size_) - for indices in _batch_by_length(outer_batch, target_size): - subbatch = [outer_batch[i] for i in indices] - padded_size = max(len(seq) for seq in subbatch) * len(subbatch) - if discard_oversize and padded_size >= target_size: - pass - else: - yield subbatch - - -def _batch_by_length(seqs: Sequence[Any], max_words: int) -> List[List[Any]]: - """Given a list of sequences, return a batched list of indices into the - list, where the batches are grouped by length, in descending order. - - Batches may be at most max_words in size, defined as max sequence length * size. - """ - # Use negative index so we can get sort by position ascending. 
- lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)] - lengths_indices.sort() - batches = [] - batch = [] - for length, i in lengths_indices: - if not batch: - batch.append(i) - elif length * (len(batch) + 1) <= max_words: - batch.append(i) - else: - batches.append(batch) - batch = [i] - if batch: - batches.append(batch) - # Check lengths match - assert sum(len(b) for b in batches) == len(seqs) - batches = [list(sorted(batch)) for batch in batches] - batches.reverse() - return batches - - -def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): - """Create minibatches of roughly a given number of words. If any examples - are longer than the specified batch length, they will appear in a batch by - themselves, or be discarded if discard_oversize=True. - The argument 'docs' can be a list of strings, Doc's or Example's. """ - from .gold import Example - - if isinstance(size, int): - size_ = itertools.repeat(size) - elif isinstance(size, List): - size_ = iter(size) - else: - size_ = size - target_size = next(size_) - tol_size = target_size * tolerance - batch = [] - overflow = [] - batch_size = 0 - overflow_size = 0 - for doc in docs: - if isinstance(doc, Example): - n_words = len(doc.reference) - elif isinstance(doc, str): - n_words = len(doc.split()) - else: - n_words = len(doc) - # if the current example exceeds the maximum batch size, it is returned separately - # but only if discard_oversize=False. - if n_words > target_size + tol_size: - if not discard_oversize: - yield [doc] - # add the example to the current batch if there's no overflow yet and it still fits - elif overflow_size == 0 and (batch_size + n_words) <= target_size: - batch.append(doc) - batch_size += n_words - # add the example to the overflow buffer if it fits in the tolerance margin - elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): - overflow.append(doc) - overflow_size += n_words - # yield the previous batch and start a new one. 
The new one gets the overflow examples. - else: - if batch: - yield batch - target_size = next(size_) - tol_size = target_size * tolerance - batch = overflow - batch_size = overflow_size - overflow = [] - overflow_size = 0 - # this example still fits - if (batch_size + n_words) <= target_size: - batch.append(doc) - batch_size += n_words - # this example fits in overflow - elif (batch_size + n_words) <= (target_size + tol_size): - overflow.append(doc) - overflow_size += n_words - # this example does not fit with the previous overflow: start another new batch - else: - if batch: - yield batch - target_size = next(size_) - tol_size = target_size * tolerance - batch = [doc] - batch_size = n_words - batch.extend(overflow) - if batch: - yield batch - - def filter_spans(spans: Iterable["Span"]) -> List["Span"]: """Filter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or @@ -1219,3 +1085,20 @@ def create_default_optimizer() -> Optimizer: L2_is_weight_decay=L2_is_weight_decay, ) return optimizer + + +def minibatch(items, size): + """Iterate over batches of items. `size` may be an iterator, + so that batch-size can vary on each step. 
+ """ + if isinstance(size, int): + size_ = itertools.repeat(size) + else: + size_ = size + items = iter(items) + while True: + batch_size = next(size_) + batch = list(itertools.islice(items, int(batch_size))) + if len(batch) == 0: + break + yield list(batch) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 95f7d0597..a22ee5be8 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -6,6 +6,7 @@ menu: - ['Tok2Vec', 'tok2vec'] - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] + - ['Tagging', 'tagger'] - ['Text Classification', 'textcat'] - ['Entity Linking', 'entitylinker'] --- @@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.HashEmbedCNN.v1" +> # TODO: ... +> +> [model.tok2vec] +> # ... +> ``` + +| Name | Type | Description | +| -------------------- | ----- | ----------- | +| `width` | int | | +| `depth` | int | | +| `embed_size` | int | | +| `window_size` | int | | +| `maxout_pieces` | int | | +| `subword_features` | bool | | +| `dropout` | float | | +| `pretrained_vectors` | bool | | + ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN} ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} @@ -99,6 +124,28 @@ architectures into your training config. | `use_upper` | bool | | | `nO` | int | | +## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} + +### spacy.Tagger.v1 {#Tagger} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.Tagger.v1" +> nO = null +> +> [model.tok2vec] +> # ... 
+> ``` + +| Name | Type | Description | +| --------- | ------------------------------------------ | ----------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | +| `nO` | int | | + ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} @@ -112,3 +159,21 @@ architectures into your training config. ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} ### spacy.EntityLinker.v1 {#EntityLinker} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.EntityLinker.v1" +> nO = null +> +> [model.tok2vec] +> # ... +> ``` + +| Name | Type | Description | +| --------- | ------------------------------------------ | ----------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | +| `nO` | int | | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 68aff4c46..abe050661 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -6,11 +6,11 @@ menu: - ['Download', 'download'] - ['Info', 'info'] - ['Validate', 'validate'] + - ['Init', 'init'] - ['Convert', 'convert'] - ['Debug', 'debug'] - ['Train', 'train'] - ['Pretrain', 'pretrain'] - - ['Init Model', 'init-model'] - ['Evaluate', 'evaluate'] - ['Package', 'package'] - ['Project', 'project'] @@ -94,6 +94,80 @@ $ python -m spacy validate | ---------- | -------- | --------------------------------------------------------- | | **PRINTS** | `stdout` | Details about the compatibility of your installed models. | +## Init {#init new="3"} + +The `spacy init` CLI includes helpful commands for initializing training config +files and model directories. + +### init config {#init-config new="3"} + +Initialize and export a [`config.cfg` file](/usage/training#config) for training +and update it with all default values, if possible. 
Config files used for +training should always be complete and not contain any hidden defaults or +missing values, so this command helps you create your final config. It takes +**one** of the following options: + +- `--base`: Base **config** to auto-fill, e.g. created using the + [training quickstart](/usage/training#quickstart) widget. +- `--lang`: Base **language** code to use for blank config. +- `--model`: Base **model** to copy config from. + +> ```bash +> ### with base config {wrap="true"} +> $ python -m spacy init config config.cfg --base base.cfg +> ``` +> +> ```bash +> ### blank language {wrap="true"} +> $ python -m spacy init config config.cfg --lang en --pipeline tagger,parser +> ``` + +```bash +$ python -m spacy init config [output] [--base] [--lang] [--model] [--pipeline] +``` + +| Argument | Type | Description | +| ------------------ | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. | +| `--base`, `-b` | option | Optional base config file to auto-fill with defaults. | +| `--lang`, `-l` | option | Optional language code to use for blank config. If a `--pipeline` is specified, the components will be added in order. | +| `--model`, `-m` | option | Optional base model to copy config from. If a `--pipeline` is specified, only those components will be kept, and all other components not in the model will be added. | +| `--pipeline`, `-p` | option | Optional comma-separated pipeline of components to add to blank language or model. | +| **CREATES** | config | Complete and auto-filled config file for training. | + +### init model {#init-model new="2"} + + + +Create a new model directory from raw data, like word frequencies, Brown +clusters and word vectors.
This command is similar to the `spacy model` command +in v1.x. Note that in order to populate the model's vocab, you need to pass in a +JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as +`--jsonl-loc` with optional `id` values that correspond to the vectors table. +Just loading in vectors will not automatically populate the vocab. + + + +The `init-model` command is now available as a subcommand of `spacy init`. + + + +```bash +$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] +[--prune-vectors] +``` + +| Argument | Type | Description | +| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. 
| +| **CREATES** | model | A spaCy model containing the vocab and vectors. | + ## Convert {#convert} Convert files into spaCy's @@ -619,32 +693,6 @@ tokenization can be provided. {"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]} ``` -## Init Model {#init-model new="2"} - -Create a new model directory from raw data, like word frequencies, Brown -clusters and word vectors. This command is similar to the `spacy model` command -in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as -`--jsonl-loc` with optional `id` values that correspond to the vectors table. -Just loading in vectors will not automatically populate the vocab. - -```bash -$ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] -[--prune-vectors] -``` - -| Argument | Type | Description | -| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. 
| -| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| `--omit-extra-lookups`, `-OEL` 2.3 | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | - ## Evaluate {#evaluate new="2"} diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 38e19129d..5f639d050 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -6,30 +6,44 @@ source: spacy/gold/corpus.py new: 3 --- -This class manages annotated corpora and can read training and development -datasets in the [DocBin](/api/docbin) (`.spacy`) format. +This class manages annotated corpora and can be used for training and +development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To +customize the data loading during training, you can register your own +[data readers and batchers](/usage/training#custom-code-readers-batchers). ## Corpus.\_\_init\_\_ {#init tag="method"} -Create a `Corpus`. The input data can be a file or a directory of files. +Create a `Corpus` for iterating [Example](/api/example) objects from a file or +directory of [`.spacy` data files](/api/data-formats#binary-training). The +`gold_preproc` setting lets you specify whether to set up the `Example` object +with gold-standard sentences and tokens for the predictions. Gold preprocessing +helps the annotations align to the tokenization, and may result in sequences of +more consistent length. However, it may reduce runtime accuracy due to +train/test skew.
> #### Example > > ```python > from spacy.gold import Corpus > -> corpus = Corpus("./train.spacy", "./dev.spacy") +> # With a single file +> corpus = Corpus("./data/train.spacy") +> +> # With a directory +> corpus = Corpus("./data", limit=10) > ``` -| Name | Type | Description | -| ------- | ------------ | ---------------------------------------------------------------- | -| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | -| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | -| `limit` | int | Maximum number of examples returned. `0` for no limit (default). | +| Name | Type | Description | +| --------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | The directory or filename to read from. | +| _keyword-only_ | | | +|  `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. | +| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | +| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | -## Corpus.train_dataset {#train_dataset tag="method"} +## Corpus.\_\_call\_\_ {#call tag="method"} -Yield examples from the training data. +Yield examples from the data. > #### Example > @@ -37,60 +51,12 @@ Yield examples from the training data. 
> from spacy.gold import Corpus > import spacy > -> corpus = Corpus("./train.spacy", "./dev.spacy") +> corpus = Corpus("./train.spacy") > nlp = spacy.blank("en") -> train_data = corpus.train_dataset(nlp) +> train_data = corpus(nlp) > ``` -| Name | Type | Description | -| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | -| `nlp` | `Language` | The current `nlp` object. | -| _keyword-only_ | | | -| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. | -| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | -| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default).  | -| **YIELDS** | `Example` | The examples. | - -## Corpus.dev_dataset {#dev_dataset tag="method"} - -Yield examples from the development data. - -> #### Example -> -> ```python -> from spacy.gold import Corpus -> import spacy -> -> corpus = Corpus("./train.spacy", "./dev.spacy") -> nlp = spacy.blank("en") -> dev_data = corpus.dev_dataset(nlp) -> ``` - -| Name | Type | Description | -| -------------- | ---------- | ---------------------------------------------------------------------------- | -| `nlp` | `Language` | The current `nlp` object. | -| _keyword-only_ | | | -| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | -| **YIELDS** | `Example` | The examples. | - -## Corpus.count_train {#count_train tag="method"} - -Get the word count of all training examples. 
- -> #### Example -> -> ```python -> from spacy.gold import Corpus -> import spacy -> -> corpus = Corpus("./train.spacy", "./dev.spacy") -> nlp = spacy.blank("en") -> word_count = corpus.count_train(nlp) -> ``` - -| Name | Type | Description | -| ----------- | ---------- | ------------------------- | -| `nlp` | `Language` | The current `nlp` object. | -| **RETURNS** | int | The word count. | - - +| Name | Type | Description | +| ---------- | ---------- | ------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| **YIELDS** | `Example` | The examples. | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index a18e9e582..e56e85e64 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("parser", config=config) > ``` + + | Setting | Type | Description | Default | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | +| `moves` | list | | `None` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python @@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. 
Used to add entries to the `losses` during training. | -| `moves` | list | | +| `moves` | list | | | _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| `update_with_oracle_cut_size` | int | | +| `multitasks` | `Iterable` | | +| `learn_tokens` | bool | | +| `min_action_freq` | int | | ## DependencyParser.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 06b4ade60..18d9c5edd 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -65,6 +65,8 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | ---------------- | --------------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | @@ -126,7 +128,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. Returns an -[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index b5b549a04..0ab17f953 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters. 
> nlp.add_pipe("ner", config=config) > ``` + + | Setting | Type | Description | Default | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | +| `moves` | list | | `None` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python @@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | list | | +| `moves` | list | | | _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| `update_with_oracle_cut_size` | int | | +| `multitasks` | `Iterable` | | +| `learn_tokens` | bool | | +| `min_action_freq` | int | | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index d3f61c7e2..8c117aec7 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -289,7 +289,6 @@ Calculate alignment tables between two tokenizations. | `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. 
| | `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. | - The current implementation of the alignment algorithm assumes that both @@ -310,8 +309,9 @@ tokenizations add up to the same string. For example, you'll be able to align > a2b = alignment.x2y > assert list(a2b.dataXd) == [0, 1, 1, 2] > ``` -> -> If `a2b.dataXd[1] == a2b.dataXd[2] == 1`, that means that `A[1]` (`"'"`) and `A[2]` (`"s"`) both align to `B[1]` (`"'s"`). +> +> If `a2b.dataXd[1] == a2b.dataXd[2] == 1`, that means that `A[1]` (`"'"`) and +> `A[2]` (`"s"`) both align to `B[1]` (`"'s"`). ### Alignment.from_strings {#classmethod tag="function"} @@ -320,4 +320,3 @@ tokenizations add up to the same string. For example, you'll be able to align | `A` | list | String values of candidate tokens to align. | | `B` | list | String values of reference tokens to align. | | **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | - diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 7e25106d1..7464a029e 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -98,10 +98,10 @@ decorator. For more details and examples, see the | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | str | The name of the component factory. | | _keyword-only_ | | | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. 
| -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | @@ -146,10 +146,10 @@ examples, see the | `name` | str | The name of the component factory. | | _keyword-only_ | | | | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. 
| +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | @@ -302,6 +302,7 @@ Evaluate a model's pipeline components. | `batch_size` | int | The batch size to use. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. | | **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | ## Language.use_params {#use_params tag="contextmanager, method"} @@ -362,7 +363,7 @@ that take a `Doc` object, modify it and return it. 
Only one of `before`, As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method doesn't -take callables anymore and instead expects the name of a component factory +take callables anymore and instead expects the **name of a component factory** registered using [`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory). It now takes care of creating the component, adds it to the pipeline and returns it. @@ -378,20 +379,25 @@ component, adds it to the pipeline and returns it. > > nlp.add_pipe("component", before="ner") > component = nlp.add_pipe("component", name="custom_name", last=True) +> +> # Add component from source model +> source_nlp = spacy.load("en_core_web_sm") +> nlp.add_pipe("ner", source=source_nlp) > ``` -| Name | Type | Description | -| -------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory_name` | str | Name of the registered component factory. | -| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | -| _keyword-only_ | | | -| `before` | str / int | Component name or index to insert component directly before. | -| `after` | str / int | Component name or index to insert component directly after: | -| `first` | bool | Insert component first / not first in the pipeline. | -| `last` | bool | Insert component last / not last in the pipeline. | -| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | -| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** 3 | callable | The pipeline component. 
| +| Name | Type | Description | +| -------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory_name` | str | Name of the registered component factory. | +| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | +| _keyword-only_ | | | +| `before` | str / int | Component name or index to insert component directly before. | +| `after` | str / int | Component name or index to insert component directly after: | +| `first` | bool | Insert component first / not first in the pipeline. | +| `last` | bool | Insert component last / not last in the pipeline. | +| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | +| `source` 3 | `Language` | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. | +| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | +| **RETURNS** 3 | callable | The pipeline component. | ## Language.has_factory {#has_factory tag="classmethod" new="3"} @@ -597,6 +603,97 @@ contains the information about the component and its default provided by the | `name` | str | The pipeline component name. | | **RETURNS** | [`FactoryMeta`](#factorymeta) |  The factory meta. 
| +## Language.analyze_pipes {#analyze_pipes tag="method" new="3"} + +Analyze the current pipeline components and show a summary of the attributes +they assign and require, and the scores they set. The data is based on the +information provided in the [`@Language.component`](/api/language#component) and +[`@Language.factory`](/api/language#factory) decorator. If requirements aren't +met, e.g. if a component specifies a required property that is not set by a +previous component, a warning is shown. + + + +The pipeline analysis is static and does **not actually run the components**. +This means that it relies on the information provided by the components +themselves. If a custom component declares that it assigns an attribute but it +doesn't, the pipeline analysis won't catch that. + + + +> #### Example +> +> ```python +> nlp = spacy.blank("en") +> nlp.add_pipe("tagger") +> nlp.add_pipe("entity_linker") +> analysis = nlp.analyze_pipes() +> ``` + + + +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} +``` + +``` +### Pretty +============================= Pipeline Overview ============================= + +# Component Assigns Requires Scores 
Retokenizes +- ------------- --------------- -------------- --------- ----------- +0 tagger token.tag tag_acc False + pos_acc + lemma_acc + +1 entity_linker token.ent_kb_id doc.ents False + doc.sents + token.ent_iob + token.ent_type + + +================================ Problems (4) ================================ +⚠ 'entity_linker' requirements not met: doc.ents, doc.sents, +token.ent_iob, token.ent_type +``` + + + +| Name | Type | Description | +| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | +| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. | +| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). | + ## Language.meta {#meta tag="property"} Custom meta data for the Language class. If a model is loaded, contains meta @@ -832,8 +929,8 @@ instance and factory instance. | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory` | str | The name of the registered component factory. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. 
Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.   | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.   | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).  | +| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).  | +| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index ac7146543..bfe5c3c77 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | -------------- | ------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. 
| | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `labels_morph` | dict | | -| `labels_pos` | dict | | +| `labels_morph` | dict | | +| `labels_pos` | dict | | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index f50a13099..2f37843a0 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -6,10 +6,9 @@ source: spacy/scorer.py --- The `Scorer` computes evaluation scores. It's typically created by -[`Language.evaluate`](/api/language#evaluate). - -In addition, the `Scorer` provides a number of evaluation methods for evaluating -`Token` and `Doc` attributes. +[`Language.evaluate`](/api/language#evaluate). In addition, the `Scorer` +provides a number of evaluation methods for evaluating [`Token`](/api/token) and +[`Doc`](/api/doc) attributes. ## Scorer.\_\_init\_\_ {#init tag="method"} @@ -20,10 +19,10 @@ Create a new `Scorer`. > ```python > from spacy.scorer import Scorer > -> # default scoring pipeline +> # Default scoring pipeline > scorer = Scorer() > -> # provided scoring pipeline +> # Provided scoring pipeline > nlp = spacy.load("en_core_web_sm") > scorer = Scorer(nlp) > ``` @@ -40,16 +39,20 @@ scoring methods provided by the components in the pipeline. The returned `Dict` contains the scores provided by the individual pipeline components. For the scoring methods provided by the `Scorer` and use by the core pipeline components, the individual score names start with the `Token` or `Doc` -attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`, -`pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc`, `dep_uas`, `dep_las`, -`dep_las_per_type`, `ents_p/r/f`, `ents_per_type`, `textcat_macro_auc`, -`textcat_macro_f`. 
+attribute being scored: + +- `token_acc`, `token_p`, `token_r`, `token_f` +- `sents_p`, `sents_r`, `sents_f` +- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` +- `dep_uas`, `dep_las`, `dep_las_per_type` +- `ents_p`, `ents_r`, `ents_f`, `ents_per_type` +- `textcat_macro_auc`, `textcat_macro_f` > #### Example > > ```python > scorer = Scorer() -> scorer.score(examples) +> scores = scorer.score(examples) > ``` | Name | Type | Description | @@ -57,78 +60,148 @@ attribute being scored: `token_acc`, `token_p/r/f`, `sents_p/r/f`, `tag_acc`, | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | **RETURNS** | `Dict` | A dictionary of scores. | -## Scorer.score_tokenization {#score_tokenization tag="staticmethod"} +## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"} Scores the tokenization: -- `token_acc`: # correct tokens / # gold tokens -- `token_p/r/f`: PRF for token character spans +- `token_acc`: number of correct tokens / number of gold tokens +- `token_p`, `token_r`, `token_f`: precision, recall and F-score for token + character spans + +> #### Example +> +> ```python +> scores = Scorer.score_tokenization(examples) +> ``` | Name | Type | Description | | ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc/p/r/f`. | +| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. | -## Scorer.score_token_attr {#score_token_attr tag="staticmethod"} +## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} Scores a single token attribute. 
-| Name | Type | Description | -| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict` | A dictionary containing the score `attr_acc`. | +> #### Example +> +> ```python +> scores = Scorer.score_token_attr(examples, "pos") +> print(scores["pos_acc"]) +> ``` -## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod"} +| Name | Type | Description | +| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| _keyword-only_ | | | +| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. | -Scores a single token attribute per feature for a token attribute in UFEATS +## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} + +Scores a single token attribute per feature for a token attribute in +[UFEATS](https://universaldependencies.org/format.html#morphological-annotation) format. 
-| Name | Type | Description | -| ----------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores unders the key `attr_per_feat`. | +> #### Example +> +> ```python +> scores = Scorer.score_token_attr_per_feat(examples, "morph") +> print(scores["morph_per_feat"]) +> ``` -## Scorer.score_spans {#score_spans tag="staticmethod"} +| Name | Type | Description | +| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| _keyword-only_ | | | +| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. | + +## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} Returns PRF scores for labeled or unlabeled spans. -| Name | Type | Description | -| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. 
| -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | -| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `attr_p/r/f` and the per-type PRF scores under `attr_per_type`. | +> #### Example +> +> ```python +> scores = Scorer.score_spans(examples, "ents") +> print(scores["ents_f"]) +> ``` -## Scorer.score_deps {#score_deps tag="staticmethod"} +| Name | Type | Description | +| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| _keyword-only_ | | | +| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | +| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. | + +## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} Calculate the UAS, LAS, and LAS per type scores for dependency parses. +> #### Example +> +> ```python +> def dep_getter(token, attr): +> dep = getattr(token, attr) +> dep = token.vocab.strings.as_string(dep).lower() +> return dep +> +> scores = Scorer.score_deps( +> examples, +> "dep", +> getter=dep_getter, +> ignore_labels=("p", "punct") +> ) +> print(scores["dep_uas"], scores["dep_las"]) +> ``` + | Name | Type | Description | | --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. 
| | `attr` | `str` | The attribute containing the dependency label. | -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | +| _keyword-only_ | | | +| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | | `head_attr` | `str` | The attribute containing the head token. | | `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | | `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). | -| **RETURNS** | `Dict` | A dictionary containing the scores: `attr_uas`, `attr_las`, and `attr_las_per_type`. | +| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. | -## Scorer.score_cats {#score_cats tag="staticmethod"} +## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict containing scores for each label like `Doc.cats`. The reported overall score -depends on the scorer settings. +depends on the scorer settings: -| Name | Type | Description | -| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. 
| -| `getter` | `callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | -| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | -| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | -| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | -| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`: 1) for all: `attr_score` (one of `attr_f` / `attr_macro_f` / `attr_macro_auc`), `attr_score_desc` (text description of the overall score), `attr_f_per_type`, `attr_auc_per_type`; 2) for binary exclusive with positive label: `attr_p/r/f`; 3) for 3+ exclusive classes, macro-averaged fscore: `attr_macro_f`; 4) for multilabel, macro-averaged AUC: `attr_macro_auc` | +1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / + `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall + score), `{attr}_f_per_type`, `{attr}_auc_per_type` +2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` +3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f` +4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc` + +> #### Example +> +> ```python +> labels = ["LABEL_A", "LABEL_B", "LABEL_C"] +> scores = Scorer.score_cats( +> examples, +> "cats", +> labels=labels +> ) +> print(scores["cats_macro_auc"]) +> ``` + +| Name | Type | Description | +| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | +| `attr` | `str` | The attribute to score. | +| _keyword-only_ | | | +| `getter` | `Callable` | Defaults to `getattr`. 
If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | +| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | +| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | +| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | +| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 68158645d..0954fb577 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -4,6 +4,7 @@ menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] - ['registry', 'registry'] + - ['Readers & Batchers', 'readers-batchers'] - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -31,12 +32,13 @@ loaded in via [`Language.from_disk`](/api/language#from_disk). > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > ``` -| Name | Type | Description | -| ------------------------------------------ | ----------------- | --------------------------------------------------------------------------------- | -| `name` | str / `Path` | Model to load, i.e. package name or path. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 3 | `Dict[str, dict]` | Optional config overrides for pipeline components, keyed by component names. | -| **RETURNS** | `Language` | A `Language` object with the loaded model. | +| Name | Type | Description | +| ----------------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str / `Path` | Model to load, i.e. package name or path. 
| +| _keyword-only_ | | | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `config` 3 | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. | +| **RETURNS** | `Language` | A `Language` object with the loaded model. | Essentially, `spacy.load()` is a convenience wrapper that reads the language ID and pipeline components from a model's `meta.json`, initializes the `Language` @@ -83,11 +85,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > markdown = spacy.info(markdown=True, silent=True) > ``` -| Name | Type | Description | -| ---------- | ---- | ------------------------------------------------ | -| `model` | str | A model, i.e. a package name or path (optional). | -| `markdown` | bool | Print information as Markdown. | -| `silent` | bool | Don't print anything, just return. | +| Name | Type | Description | +| -------------- | ---- | ------------------------------------------------ | +| `model` | str | A model, i.e. a package name or path (optional). | +| _keyword-only_ | | | +| `markdown` | bool | Print information as Markdown. | +| `silent` | bool | Don't print anything, just return. | ### spacy.explain {#spacy.explain tag="function"} @@ -290,6 +293,8 @@ factories. > return Model("custom", forward, dims={"nO": nO}) > ``` + + | Registry name | Description | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. 
| @@ -297,7 +302,10 @@ factories. | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | | `lookups` | Registry for large lookup tables available via `vocab.lookups`. | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | -| `assets` | | +| `assets` | | +| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. | +| `readers` | Registry for training and evaluation [data readers](#readers-batchers). | +| `batchers` | Registry for training and evaluation [data batchers](#readers-batchers). | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | @@ -324,10 +332,117 @@ See the [`Transformer`](/api/transformer) API reference and > return annotation_sette > ``` -| Registry name | Description | -| ------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | -| [`annotation_setters`](/api/transformers#annotation_setters) | Registry for functions that create annotation setters. 
Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | +| Registry name | Description | +| ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | +| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | + +## Data readers and batchers {#readers-batchers new="3"} + + + +### spacy.Corpus.v1 {#corpus tag="registered function" source="spacy/gold/corpus.py"} + +Registered function that creates a [`Corpus`](/api/corpus) of training or +evaluation data. It takes the same arguments as the `Corpus` class and returns a +callable that yields [`Example`](/api/example) objects. You can replace it with +your own registered function in the [`@readers` registry](#registry) to +customize the data loading and streaming.
+ +> #### Example config +> +> ```ini +> [paths] +> train = "corpus/train.spacy" +> +> [training.train_corpus] +> @readers = "spacy.Corpus.v1" +> path = ${paths:train} +> gold_preproc = false +> max_length = 0 +> limit = 0 +> ``` + +| Name | Type | Description | +| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). | +|  `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. | +| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | +| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | + +### Batchers {#batchers source="spacy/gold/batchers.py"} + + + +#### batch_by_words.v1 {#batch_by_words tag="registered function"} + +Create minibatches of roughly a given number of words. If any examples are +longer than the specified batch length, they will appear in a batch by +themselves, or be discarded if `discard_oversize` is set to `True`. The argument +`docs` can be a list of strings, [`Doc`](/api/doc) objects or +[`Example`](/api/example) objects. + +> #### Example config +> +> ```ini +> [training.batcher] +> @batchers = "batch_by_words.v1" +> size = 100 +> tolerance = 0.2 +> discard_oversize = false +> get_length = null +> ``` + + + +| Name | Type | Description | +| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The batch size. 
Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `tolerance` | float | | +| `discard_oversize` | bool | Discard items that are longer than the specified batch length. | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | + +#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} + + + +> #### Example config +> +> ```ini +> [training.batcher] +> @batchers = "batch_by_sequence.v1" +> size = 32 +> get_length = null +> ``` + + + +| Name | Type | Description | +| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | + +#### batch_by_padded.v1 {#batch_by_padded tag="registered function"} + + + +> #### Example config +> +> ```ini +> [training.batcher] +> @batchers = "batch_by_padded.v1" +> size = 100 +> buffer = TODO: +> discard_oversize = false +> get_length = null +> ``` + +| Name | Type | Description | +| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `buffer` | int | | +| `discard_oversize` | bool | Discard items that are longer than the specified batch length.
| +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | ## Training data and alignment {#gold source="spacy/gold"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 70128d225..6b6be6bd0 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument. Transformer tokens and outputs for one `Doc` object. -| Name | Type | Description | -| --------- | -------------------------------------------------- | ----------------------------------------- | -| `tokens` | `Dict` | | -| `tensors` | `List[FloatsXd]` | | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | -| `width` | int | | + + +| Name | Type | Description | +| --------- | -------------------------------------------------- | ----------- | +| `tokens` | `Dict` | | +| `tensors` | `List[FloatsXd]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `width` | int | | ### TransformerData.empty {#transformerdata-emoty tag="classmethod"} - + -| Name | Type | Description | -| ----------- | ----------------- | -------------- | -| **RETURNS** | `TransformerData` | | +| Name | Type | Description | +| ----------- | ----------------- | ----------- | +| **RETURNS** | `TransformerData` | | ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} - + -| Name | Type | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | -| `spans` | `List[List[Span]]` | | -| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | -| `tensors` | `List[torch.Tensor]` | | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | 
| -| `doc_data` | `List[TransformerData]` | | +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} - + -| Name | Type | Description | -| ----------- | ---------------------- | -------------- | -| `arrays` | `List[List[Floats3d]]` | | -| **RETURNS** | `FullTransformerBatch` | | +| Name | Type | Description | +| ----------- | ---------------------- | ----------- | +| `arrays` | `List[List[Floats3d]]` | | +| **RETURNS** | `FullTransformerBatch` | | ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} Split a `TransformerData` object that represents a batch into a list with one `TransformerData` per `Doc`. -| Name | Type | Description | -| ----------- | ----------------------- | -------------- | -| **RETURNS** | `List[TransformerData]` | | +| Name | Type | Description | +| ----------- | ----------------------- | ----------- | +| **RETURNS** | `List[TransformerData]` | | ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} @@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator. The following built-in functions are available: + + | Name | Description | | ------------------ | ------------------------------------------------------------------ | | `doc_spans.v1` | Create a span for each doc (no transformation, process each text). 
| | `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. | -| `strided_spans.v1` | | +| `strided_spans.v1` | | ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 56ade692a..7c47c0c73 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -231,10 +231,10 @@ available pipeline components and component functions. | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | - + @@ -311,6 +311,155 @@ nlp.rename_pipe("ner", "entityrecognizer") nlp.replace_pipe("tagger", my_custom_tagger) ``` +### Sourcing pipeline components from existing models {#sourced-components new="3"} + +Pipeline components that are independent can also be reused across models. +Instead of adding a new blank component to a pipeline, you can also copy an +existing component from a pretrained model by setting the `source` argument on +[`nlp.add_pipe`](/api/language#add_pipe). The first argument will then be +interpreted as the name of the component in the source pipeline – for instance, +`"ner"`. 
This is especially useful for +[training a model](/usage/training#config-components) because it lets you mix +and match components and create fully custom model packages with updated +pretrained components and new components trained on your data. + + + +When reusing components across models, keep in mind that the **vocabulary**, +**vectors** and model settings **must match**. If a pretrained model includes +[word vectors](/usage/vectors-embeddings) and the component uses them as +features, the model you copy it to needs to have the _same_ vectors available – +otherwise, it won't be able to make the same predictions. + + + +> #### In training config +> +> Instead of providing a `factory`, component blocks in the training +> [config](/usage/training#config) can also define a `source`. The string needs +> to be a loadable spaCy model package or path. +> +> ```ini +> [components.ner] +> source = "en_core_web_sm" +> component = "ner" +> ``` +> +> By default, sourced components will be updated with your data during training. +> If you want to preserve the component as-is, you can "freeze" it: +> +> ```ini +> [training] +> frozen_components = ["ner"] +> ``` + +```python +### {executable="true"} +import spacy + +# The source model with different components +source_nlp = spacy.load("en_core_web_sm") +print(source_nlp.pipe_names) + +# Add only the entity recognizer to the new blank model +nlp = spacy.blank("en") +nlp.add_pipe("ner", source=source_nlp) +print(nlp.pipe_names) +``` + +### Analyzing pipeline components {#analysis new="3"} + +The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the +components in the current pipeline and outputs information about them, like the +attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether +they retokenize the `Doc` and which scores they produce during training.
It will +also show warnings if components require values that aren't set by previous +components – for instance, if the entity linker is used but no component that +runs before it sets named entities. Setting `pretty=True` will pretty-print a +table instead of only returning the structured data. + +> #### ✏️ Things to try +> +> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker. +> The analysis should now show no problems, because requirements are met. + +```python +### {executable="true"} +import spacy + +nlp = spacy.blank("en") +nlp.add_pipe("tagger") +# This is a problem because it needs entities and sentence boundaries +nlp.add_pipe("entity_linker") +analysis = nlp.analyze_pipes(pretty=True) +``` + + + +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} +``` + +``` +### Pretty +============================= Pipeline Overview ============================= + +# Component Assigns Requires Scores Retokenizes +- ------------- --------------- -------------- --------- ----------- +0 tagger token.tag tag_acc False + pos_acc + lemma_acc + +1 entity_linker token.ent_kb_id doc.ents False + doc.sents + token.ent_iob +
token.ent_type + + +================================ Problems (4) ================================ +⚠ 'entity_linker' requirements not met: doc.ents, doc.sents, +token.ent_iob, token.ent_type +``` + + + + + +The pipeline analysis is static and does **not actually run the components**. +This means that it relies on the information provided by the components +themselves. If a custom component declares that it assigns an attribute but it +doesn't, the pipeline analysis won't catch that. + + + ## Creating custom pipeline components {#custom-components} A pipeline component is a function that receives a `Doc` object, modifies it and @@ -489,6 +638,8 @@ All other settings can be passed in by the user via the `config` argument on [`@Language.factory`](/api/language#factory) decorator also lets you define a `default_config` that's used as a fallback. + + ```python ### With config {highlight="4,9"} import spacy diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index cdd7d1c49..904477733 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md' ### Serializing the pipeline {#pipeline} - - When serializing the pipeline, keep in mind that this will only save out the **binary data for the individual components** to allow spaCy to restore them – not the entire objects. 
This is a good thing, because it makes serialization diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 12785b6de..c0ec052b9 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -3,9 +3,10 @@ title: Training Models next: /usage/projects menu: - ['Introduction', 'basics'] - - ['CLI & Config', 'cli-config'] - - ['Transfer Learning', 'transfer-learning'] + - ['Quickstart', 'quickstart'] + - ['Config System', 'config'] - ['Custom Models', 'custom-models'] + - ['Transfer Learning', 'transfer-learning'] - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -29,12 +30,13 @@ ready-to-use spaCy models. -## Training CLI & config {#cli-config} +### Training CLI & config {#cli-config} The recommended way to train your spaCy models is via the -[`spacy train`](/api/cli#train) command on the command line. +[`spacy train`](/api/cli#train) command on the command line. You can pass in the +following data and information: 1. The **training and evaluation data** in spaCy's [binary `.spacy` format](/api/data-formats#binary-training) created using @@ -42,14 +44,43 @@ The recommended way to train your spaCy models is via the 2. A [`config.cfg`](#config) **configuration file** with all settings and hyperparameters. 3. An optional **Python file** to register - [custom models and architectures](#custom-models). - - + [custom functions and architectures](#custom-code). ```bash $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output ``` + + +The easiest way to get started with an end-to-end training process is to clone a +[project](/usage/projects) template. Projects let you manage multi-step +workflows, from data preprocessing to training and packaging your model. + + + +## Quickstart {#quickstart} + +> #### Instructions +> +> 1. Select your requirements and settings. +> 2. Use the buttons at the bottom to save the result to your clipboard or a +> file `base_config.cfg`. +> 3. 
Run [`init config`](/api/cli#init-config) to create a full training config. +> 4. Run [`train`](/api/cli#train) with your config and data. + +import QuickstartTraining from 'widgets/quickstart-training.js' + + + +After you've saved the starter config to a file `base_config.cfg`, you can use +the [`init config`](/api/cli#init-config) command to fill in the remaining +defaults. Training configs should always be **complete and without hidden +defaults**, to keep your experiments reproducible. + +```bash +$ python -m spacy init config config.cfg --base base_config.cfg +``` + > #### Tip: Debug your data > > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate @@ -60,46 +91,15 @@ $ python -m spacy train train.spacy dev.spacy config.cfg --output ./output > $ python -m spacy debug-data en train.spacy dev.spacy --verbose > ``` - +You can now run [`train`](/api/cli#train) with your training and development +data and the training config. See the [`convert`](/api/cli#convert) command for +details on how to convert your data to spaCy's binary `.spacy` format. -The easiest way to get started with an end-to-end training process is to clone a -[project](/usage/projects) template. Projects let you manage multi-step -workflows, from data preprocessing to training and packaging your model. +```bash +$ python -m spacy train train.spacy dev.spacy config.cfg --output ./output +``` - - - - -When you train a model using the [`spacy train`](/api/cli#train) command, you'll -see a table showing metrics after each pass over the data. Here's what those -metrics means: - - - -| Name | Description | -| ---------- | ------------------------------------------------------------------------------------------------- | -| `Dep Loss` | Training loss for dependency parser. Should decrease, but usually not to 0. | -| `NER Loss` | Training loss for named entity recognizer. Should decrease, but usually not to 0. | -| `UAS` | Unlabeled attachment score for parser. 
The percentage of unlabeled correct arcs. Should increase. | -| `NER P.` | NER precision on development data. Should increase. | -| `NER R.` | NER recall on development data. Should increase. | -| `NER F.` | NER F-score on development data. Should increase. | -| `Tag %` | Fine-grained part-of-speech tag accuracy on development data. Should increase. | -| `Token %` | Tokenization accuracy on development data. | -| `CPU WPS` | Prediction speed on CPU in words per second, if available. Should stay stable. | -| `GPU WPS` | Prediction speed on GPU in words per second, if available. Should stay stable. | - -Note that if the development data has raw text, some of the gold-standard -entities might not align to the predicted tokenization. These tokenization -errors are **excluded from the NER evaluation**. If your tokenization makes it -impossible for the model to predict 50% of your entities, your NER F-score might -still look good. - - - ---- - -### Training config files {#config} +## Training config {#config} > #### Migration from spaCy v2.x > @@ -149,12 +149,14 @@ not just define static settings, but also construct objects like architectures, schedules, optimizers or any other custom components. The main top-level sections of a config file are: -| Section | Description | -| ------------- | --------------------------------------------------------------------------------------------------------------------- | -| `training` | Settings and controls for the training and evaluation process. | -| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). | -| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. | -| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. 
| +| Section | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. | +| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. | +| `paths` | Paths to data and other assets. Can be re-used across the config as variables, e.g. `${paths:train}`, and [overwritten](#config-overrides) on the CLI. | +| `system` | Settings related to system and hardware. | +| `training` | Settings and controls for the training and evaluation process. | +| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). | @@ -168,7 +170,7 @@ available for the different architectures are documented with the -#### Overwriting config settings on the command line {#config-overrides} +### Overwriting config settings on the command line {#config-overrides} The config system means that you can define all settings **in one place** and in a consistent format. There are no command-line arguments that need to be set, @@ -192,7 +194,87 @@ of the training, the final filled `config.cfg` is exported with your model, so you'll always have a record of the settings that were used, including your overrides. -#### Using registered functions {#config-functions} +### Defining pipeline components {#config-components} + +When you train a model, you typically train a +[pipeline](/usage/processing-pipelines) of **one or more components**. The +`[components]` block in the config defines the available pipeline components and +how they should be created – either by a built-in or custom +[factory](/usage/processing-pipelines#built-in), or +[sourced](/usage/processing-pipelines#sourced-components) from an existing +pretrained model. 
For example, `[components.parser]` defines the component named +`"parser"` in the pipeline. There are different ways you might want to treat +your components during training, and the most common scenarios are: + +1. Train a **new component** from scratch on your data. +2. Update an existing **pretrained component** with more examples. +3. Include an existing pretrained component without updating it. +4. Include a non-trainable component, like a rule-based + [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a + fully [custom component](/usage/processing-pipelines#custom-components). + +If a component block defines a `factory`, spaCy will look it up in the +[built-in](/usage/processing-pipelines#built-in) or +[custom](/usage/processing-pipelines#custom-components) components and create a +new component from scratch. All settings defined in the config block will be +passed to the component factory as arguments. This lets you configure the model +settings and hyperparameters. If a component block defines a `source`, the +component will be copied over from an existing pretrained model, with its +existing weights. This lets you include an already trained component in your +model pipeline, or update a pretrained component with more data specific to +your use case. + +```ini +### config.cfg (excerpt) +[components] + +# "parser" and "ner" are sourced from a pretrained model +[components.parser] +source = "en_core_web_sm" + +[components.ner] +source = "en_core_web_sm" + +# "textcat" and "custom" are created blank from built-in / custom factory +[components.textcat] +factory = "textcat" + +[components.custom] +factory = "your_custom_factory" +your_custom_setting = true +``` + +The `pipeline` setting in the `[nlp]` block defines the pipeline components +added to the pipeline, in order. For example, `"parser"` here references +`[components.parser]`. By default, spaCy will **update all components that can +be updated**.
Trainable components that are created from scratch are initialized +with random weights. For sourced components, spaCy will keep the existing +weights and [resume training](/api/language#resume_training). + +If you don't want a component to be updated, you can **freeze** it by adding it +to the `frozen_components` list in the `[training]` block. Frozen components are +**not updated** during training and are included in the final trained model +as-is. + +> #### Note on frozen components +> +> Even though frozen components are not **updated** during training, they will +> still **run** during training and evaluation. This is very important, because +> they may still impact your model's performance – for instance, a sentence +> boundary detector can impact what the parser or entity recognizer considers a +> valid parse. So the evaluation results should always reflect what your model +> will produce at runtime. + +```ini +[nlp] +lang = "en" +pipeline = ["parser", "ner", "textcat", "custom"] + +[training] +frozen_components = ["parser", "custom"] +``` + +### Using registered functions {#config-functions} The training configuration defined in the config file doesn't have to only consist of static values. Some settings can also be **functions**. For instance, @@ -233,40 +315,78 @@ stop = 1000 compound = 1.001 ``` +### Using variable interpolation {#config-interpolation} + + + ### Model architectures {#model-architectures} - +### Metrics, training output and weighted scores {#metrics} -## Transfer learning {#transfer-learning} +When you train a model using the [`spacy train`](/api/cli#train) command, you'll +see a table showing the metrics after each pass over the data. The available +metrics **depend on the pipeline components**. Pipeline components also define +which scores are shown and how they should be **weighted in the final score** +that decides about the best model. 
-### Using transformer models like BERT {#transformers} +The `training.score_weights` setting in your `config.cfg` lets you customize the +scores shown in the table and how they should be weighted. In this example, the +labeled dependency accuracy and NER F-score count towards the final score with +40% each and the tagging accuracy makes up the remaining 20%. The tokenization +accuracy and speed are both shown in the table, but not counted towards the +score. -spaCy v3.0 lets you use almost any statistical model to power your pipeline. You -can use models implemented in a variety of frameworks. A transformer model is -just a statistical model, so the -[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package -actually has very little work to do: it just has to provide a few functions that -do the required plumbing. It also provides a pipeline component, -[`Transformer`](/api/transformer), that lets you do multi-task learning and lets -you save the transformer outputs for later use. +> #### Why do I need score weights? +> +> At the end of your training process, you typically want to select the **best +> model** – but what "best" means depends on the available components and your +> specific use case. For instance, you may prefer a model with higher NER and +> lower POS tagging accuracy over a model with lower NER and higher POS +> accuracy. You can express this preference in the score weights, e.g. by +> assigning `ents_f` (NER F-score) a higher weight. - +```ini +[training.score_weights] +dep_las = 0.4 +ents_f = 0.4 +tag_acc = 0.2 +token_acc = 0.0 +speed = 0.0 +``` -Try out a BERT-based model pipeline using this project template: swap in your -data, edit the settings and hyperparameters and train, evaluate, package and -visualize your model. +The `score_weights` don't _have to_ sum to `1.0` – but it's recommended. 
When +you generate a config for a given pipeline, the score weights are generated by +combining and normalizing the default score weights of the pipeline components. +The default score weights are defined by each pipeline component via the +`default_score_weights` setting on the +[`@Language.component`](/api/language#component) or +[`@Language.factory`](/api/language#factory). By default, all pipeline +components are weighted equally. - + -For more details on how to integrate transformer models into your training -config and customize the implementations, see the usage guide on -[training transformers](/usage/transformers#training). + -### Pretraining with spaCy {#pretraining} +| Name | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | +| **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. | +| **Precision** (P) | Should increase. | +| **Recall** (R) | Should increase. | +| **F-Score** (F) | The weighted average of precision and recall. Should increase. | +| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. | +| **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. | - + + +Note that if the development data has raw text, some of the gold-standard +entities might not align to the predicted tokenization. These tokenization +errors are **excluded from the NER evaluation**. If your tokenization makes it +impossible for the model to predict 50% of your entities, your NER F-score might +still look good. 
+ + ## Custom model implementations and architectures {#custom-models} @@ -274,6 +394,11 @@ config and customize the implementations, see the usage guide on ### Training with custom code {#custom-code} +> ```bash +> ### Example {wrap="true"} +> $ python -m spacy train train.spacy dev.spacy config.cfg --code functions.py +> ``` + The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument `--code` that points to a Python file. The file is imported before training and allows you to add custom functions and architectures to the function registry @@ -281,6 +406,120 @@ that can then be referenced from your `config.cfg`. This lets you train spaCy models with custom components, without having to re-implement the whole training workflow. +#### Example: Modifying the nlp object {#custom-code-nlp-callbacks} + +For many use cases, you don't necessarily want to implement the whole `Language` +subclass and language data from scratch – it's often enough to make a few small +modifications, like adjusting the +[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or +[language defaults](/api/language#defaults) like stop words. The config lets you +provide three optional **callback functions** that give you access to the +language class and `nlp` object at different points of the lifecycle: + +| Callback | Description | +| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). | +| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. 
| +| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. | + +The `@spacy.registry.callbacks` decorator lets you register that function in the +`callbacks` [registry](/api/top-level#registry) under a given name. You can then +reference the function in a config block using the `@callbacks` key. If a block +contains a key starting with an `@`, it's interpreted as a reference to a +function. Because you've registered the function, spaCy knows how to create it +when you reference `"customize_language_data"` in your config. Here's an example +of a callback that runs before the `nlp` object is created and adds a few custom +tokenization rules to the defaults: + +> #### config.cfg +> +> ```ini +> [nlp.before_creation] +> @callbacks = "customize_language_data" +> ``` + +```python +### functions.py {highlight="3,6"} +import spacy + +@spacy.registry.callbacks("customize_language_data") +def create_callback(): + def customize_language_data(lang_cls): + lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",) + return lang_cls + + return customize_language_data +``` + + + +Remember that a registered function should always be a function that spaCy +**calls to create something**. In this case, it **creates a callback** – it's +not the callback itself. + + + +Any registered function – in this case `create_callback` – can also take +**arguments** that can be **set by the config**. This lets you implement and +keep track of different configurations, without having to hack at your code. You +can choose any arguments that make sense for your use case. In this example, +we're adding the arguments `extra_stop_words` (a list of strings) and `debug` +(boolean) for printing additional info when the function runs. 
+ +> #### config.cfg +> +> ```ini +> [nlp.before_creation] +> @callbacks = "customize_language_data" +> extra_stop_words = ["ooh", "aah"] +> debug = true +> ``` + +```python +### functions.py {highlight="5,8-10"} +from typing import List +import spacy + +@spacy.registry.callbacks("customize_language_data") +def create_callback(extra_stop_words: List[str] = [], debug: bool = False): + def customize_language_data(lang_cls): + lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",) + lang_cls.Defaults.stop_words.update(extra_stop_words) + if debug: + print("Updated stop words and tokenizer suffixes") + return lang_cls + + return customize_language_data +``` + + + +spaCy's configs are powered by our machine learning library Thinc's +[configuration system](https://thinc.ai/docs/usage-config), which supports +[type hints](https://docs.python.org/3/library/typing.html) and even +[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types) +using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered +function provides type hints, the values that are passed in will be checked +against the expected types. For example, `debug: bool` in the example above will +ensure that the value received as the argument `debug` is a boolean. If the +value can't be coerced into a boolean, spaCy will raise an error. +`debug: pydantic.StrictBool` will force the value to be a boolean and raise an +error if it's not – for instance, if your config defines `1` instead of `true`. + + + +With your `functions.py` defining additional code and the updated `config.cfg`, +you can now run [`spacy train`](/api/cli#train) and point the argument `--code` +to your Python file. Before loading the config, spaCy will import the +`functions.py` module and your custom functions will be registered. 
+ +```bash +### Training with custom code {wrap="true"} +python -m spacy train train.spacy dev.spacy config.cfg --output ./output --code ./functions.py +``` + +#### Example: Custom batch size schedule {#custom-code-schedule} + For example, let's say you've implemented your own batch size schedule to use during training. The `@spacy.registry.schedules` decorator lets you register that function in the `schedules` [registry](/api/top-level#registry) and assign @@ -310,9 +549,9 @@ In your config, you can now reference the schedule in the starting with an `@`, it's interpreted as a reference to a function. All other settings in the block will be passed to the function as keyword arguments. Keep in mind that the config shouldn't have any hidden defaults and all arguments on -the functions need to be represented in the config. - - +the functions need to be represented in the config. If your function defines +**default argument values**, spaCy is able to auto-fill your config when you run +[`init config`](/api/cli#init-config). ```ini ### config.cfg (excerpt) @@ -322,31 +561,9 @@ start = 2 factor = 1.005 ``` -You can now run [`spacy train`](/api/cli#train) with the `config.cfg` and your -custom `functions.py` as the argument `--code`. Before loading the config, spaCy -will import the `functions.py` module and your custom functions will be -registered. +#### Example: Custom data reading and batching {#custom-code-readers-batchers} -```bash -### Training with custom code {wrap="true"} -python -m spacy train train.spacy dev.spacy config.cfg --output ./output --code ./functions.py -``` - - - -spaCy's configs are powered by our machine learning library Thinc's -[configuration system](https://thinc.ai/docs/usage-config), which supports -[type hints](https://docs.python.org/3/library/typing.html) and even -[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types) -using [`pydantic`](https://github.com/samuelcolvin/pydantic). 
If your registered -function provides type hints, the values that are passed in will be checked -against the expected types. For example, `start: int` in the example above will -ensure that the value received as the argument `start` is an integer. If the -value can't be cast to an integer, spaCy will raise an error. -`start: pydantic.StrictInt` will force the value to be an integer and raise an -error if it's not – for instance, if your config defines a float. - - + ### Wrapping PyTorch and TensorFlow {#custom-frameworks} @@ -364,6 +581,35 @@ mattis pretium. +## Transfer learning {#transfer-learning} + +### Using transformer models like BERT {#transformers} + +spaCy v3.0 lets you use almost any statistical model to power your pipeline. You +can use models implemented in a variety of frameworks. A transformer model is +just a statistical model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: it just has to provide a few functions that +do the required plumbing. It also provides a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. + + + +Try out a BERT-based model pipeline using this project template: swap in your +data, edit the settings and hyperparameters and train, evaluate, package and +visualize your model. + + + +For more details on how to integrate transformer models into your training +config and customize the implementations, see the usage guide on +[training transformers](/usage/transformers#training). 
+ +### Pretraining with spaCy {#pretraining} + + + ## Parallel Training with Ray {#parallel-training} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index bab1b82d3..b837c62de 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -88,7 +88,8 @@ The recommended workflow for training is to use spaCy's [`spacy train`](/api/cli#train) command. The training config defines all component settings and hyperparameters in one place and lets you describe a tree of objects by referring to creation functions, including functions you register -yourself. +yourself. For details on how to get started with training your own model, check +out the [training quickstart](/usage/training#quickstart). @@ -164,10 +165,8 @@ resolved, the function is created and passed into the model as an argument. Remember that the `config.cfg` used for training should contain **no missing values** and requires all settings to be defined. You don't want any hidden defaults creeping in and changing your results! spaCy will tell you if settings -are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with -`--auto-fill` to automatically fill in all defaults. - - +are missing, and you can run [`spacy init config`](/api/cli#init-config) to +automatically fill in all defaults. 
diff --git a/website/src/components/copy.js b/website/src/components/copy.js index 4392273e2..f8013c5f1 100644 --- a/website/src/components/copy.js +++ b/website/src/components/copy.js @@ -3,21 +3,23 @@ import React, { useState, useRef } from 'react' import Icon from './icon' import classes from '../styles/copy.module.sass' +export function copyToClipboard(ref, callback) { + const isClient = typeof window !== 'undefined' + if (ref.current && isClient) { + ref.current.select() + document.execCommand('copy') + callback(true) + ref.current.blur() + setTimeout(() => callback(false), 1000) + } +} + const CopyInput = ({ text, prefix }) => { const isClient = typeof window !== 'undefined' const supportsCopy = isClient && document.queryCommandSupported('copy') const textareaRef = useRef() const [copySuccess, setCopySuccess] = useState(false) - - function copyToClipboard() { - if (textareaRef.current && isClient) { - textareaRef.current.select() - document.execCommand('copy') - setCopySuccess(true) - textareaRef.current.blur() - setTimeout(() => setCopySuccess(false), 1000) - } - } + const onClick = () => copyToClipboard(textareaRef, setCopySuccess) function selectText() { if (textareaRef.current && isClient) { @@ -37,7 +39,7 @@ const CopyInput = ({ text, prefix }) => { onClick={selectText} /> {supportsCopy && ( - )} diff --git a/website/src/components/icon.js b/website/src/components/icon.js index 8c917d13d..00b237795 100644 --- a/website/src/components/icon.js +++ b/website/src/components/icon.js @@ -22,6 +22,7 @@ import { ReactComponent as SearchIcon } from '../images/icons/search.svg' import { ReactComponent as MoonIcon } from '../images/icons/moon.svg' import { ReactComponent as ClipboardIcon } from '../images/icons/clipboard.svg' import { ReactComponent as NetworkIcon } from '../images/icons/network.svg' +import { ReactComponent as DownloadIcon } from '../images/icons/download.svg' import classes from '../styles/icon.module.sass' @@ -46,7 +47,8 @@ const icons = { 
search: SearchIcon, moon: MoonIcon, clipboard: ClipboardIcon, - network: NetworkIcon + network: NetworkIcon, + download: DownloadIcon, } const Icon = ({ name, width, height, inline, variant, className }) => { diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index fe73658c7..f1d3616a5 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -1,4 +1,4 @@ -import React, { Fragment, useState, useEffect } from 'react' +import React, { Fragment, useState, useEffect, useRef } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' import { window } from 'browser-monads' @@ -6,6 +6,7 @@ import { window } from 'browser-monads' import Section from './section' import Icon from './icon' import { H2 } from './typography' +import { copyToClipboard } from './copy' import classes from '../styles/quickstart.module.sass' function getNewChecked(optionId, checkedForId, multiple) { @@ -14,10 +15,41 @@ function getNewChecked(optionId, checkedForId, multiple) { return [...checkedForId, optionId] } -const Quickstart = ({ data, title, description, id, children }) => { +function getRawContent(ref) { + if (ref.current && ref.current.childNodes) { + // Select all currently visible nodes (spans and text nodes) + const result = [...ref.current.childNodes].filter(el => el.offsetParent !== null) + return result.map(el => el.textContent).join('\n') + } + return '' +} + +const Quickstart = ({ + data, + title, + description, + copy, + download, + id, + setters = {}, + hidePrompts, + children, +}) => { + const contentRef = useRef() + const copyAreaRef = useRef() + const isClient = typeof window !== 'undefined' + const supportsCopy = isClient && document.queryCommandSupported('copy') + const showCopy = supportsCopy && copy const [styles, setStyles] = useState({}) const [checked, setChecked] = useState({}) const [initialized, setInitialized] = useState(false) + const [copySuccess, setCopySuccess] 
= useState(false) + const [otherState, setOtherState] = useState({}) + const setOther = (id, value) => setOtherState({ ...otherState, [id]: value }) + const onClickCopy = () => { + copyAreaRef.current.value = getRawContent(contentRef) + copyToClipboard(copyAreaRef, setCopySuccess) + } const getCss = (id, checkedOptions) => { const checkedForId = checkedOptions[id] || [] @@ -32,7 +64,7 @@ const Quickstart = ({ data, title, description, id, children }) => { if (!initialized) { const initialChecked = Object.assign( {}, - ...data.map(({ id, options }) => ({ + ...data.map(({ id, options = [] }) => ({ [id]: options.filter(option => option.checked).map(({ id }) => id), })) ) @@ -48,7 +80,7 @@ const Quickstart = ({ data, title, description, id, children }) => { return !data.length ? null : (
-
+
{title && (

{title} @@ -57,82 +89,154 @@ const Quickstart = ({ data, title, description, id, children }) => { {description &&

{description}

} - {data.map(({ id, title, options = [], multiple, help }) => ( -
- -
- {title} - {help && ( - - {' '} - - - )} -
-
- {options.map(option => { - const optionType = multiple ? 'checkbox' : 'radio' - const checkedForId = checked[id] || [] - return ( - - { - const newChecked = { - ...checked, - [id]: getNewChecked( - option.id, - checkedForId, - multiple - ), + {data.map( + ({ + id, + title, + options = [], + dropdown = [], + defaultValue, + multiple, + other, + help, + }) => { + // Optional function that's called with the value + const setterFunc = setters[id] || (() => {}) + return ( +
+ +
+ {title} + {help && ( + + {' '} + + + )} +
+
+ {!!dropdown.length && ( + + )} + {other && otherState[id] && ( + setterFunc(target.value)} + /> + )} + {options.map(option => { + const optionType = multiple ? 'checkbox' : 'radio' + const checkedForId = checked[id] || [] + return ( + + { + const newChecked = { + ...checked, + [id]: getNewChecked( + option.id, + checkedForId, + multiple + ), + } + setChecked(newChecked) + setStyles({ + ...styles, + [id]: getCss(id, newChecked), + }) + setterFunc(newChecked[id]) + }} + type={optionType} + className={classNames( + classes.input, + classes[optionType] + )} + name={id} + id={`quickstart-${option.id}`} + value={option.id} + checked={checkedForId.includes(option.id)} + /> + - - ) - })} -
-
- ))} + {option.title} + {option.meta && ( + + {option.meta} + + )} + {option.help && ( + + {' '} + + + )} + +
+ ) + })} +
+
+ ) + } + )}
-                    
+                    
                         {children}
                     
+
+                    
+                        {showCopy && (
+                            
+                        )}
+                        {download && (
+                            
+                                
+                            
+                        )}
+                    
                 
+ {showCopy &&