diff --git a/Makefile b/Makefile
index d44063f83..a4df0f8c8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
diff --git a/pyproject.toml b/pyproject.toml
index 6d3a29fe9..611a95d27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 3ff8bea3d..44dad38e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a41,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
diff --git a/setup.cfg b/setup.cfg
index 92732dc33..963ce60ca 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
@@ -65,7 +65,7 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -84,7 +84,7 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.5
+    sudachipy>=0.4.9
     sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
@@ -98,7 +98,7 @@ universal = false
 formats = gztar
 
 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =
diff --git a/spacy/about.py b/spacy/about.py
index fbe772d25..acf386ace 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a29"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 92cb76971..7368bcef3 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,7 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_model import init_model  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 68cb572ea..69c32bbad 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -10,12 +10,13 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ENV_VARS
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
-OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
 
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -65,7 +65,7 @@ def setup_cli() -> None:
 
 
 def parse_config_overrides(
-    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
 ) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
@@ -275,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
 
@@ -458,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
+
+
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3fc530822..e4559929e 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,8 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
+from ..training.converters import conllu_to_docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 131fecf6d..a6c7345f0 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer
 
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
 
 
@@ -52,10 +54,10 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        dot_names = [T["train_corpus"], T["dev_corpus"]]
+        util.resolve_dot_names(config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 302bfd563..3dc8d262d 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
-from ..training import Corpus, Example
+from ._util import import_code, debug_cli
+from ..training import Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry, resolve_dot_names
 from ..
import util @@ -24,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000 @debug_cli.command( - "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, + "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} ) @app.command( "debug-data", @@ -34,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000 def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), - dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), config_path: Path = Arg(..., help="Path to config file", exists=True), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), @@ -59,8 +60,6 @@ def debug_data_cli( overrides = parse_config_overrides(ctx.args) import_code(code_path) debug_data( - train_path, - dev_path, config_path, config_overrides=overrides, ignore_warnings=ignore_warnings, @@ -71,8 +70,6 @@ def debug_data_cli( def debug_data( - train_path: Path, - dev_path: Path, config_path: Path, *, config_overrides: Dict[str, Any] = {}, @@ -85,57 +82,29 @@ def debug_data( no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings ) # Make sure all files and paths exists if they are needed - if not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if not config_path.exists(): - msg.fail("Config file not found", config_path, exists=1) with show_validation_error(config_path): cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) - C = util.resolve_training_config(nlp.config) + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] resume_components = [p for p in sourced_components if p not in frozen_components] pipeline = nlp.pipe_names factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] - tag_map_path = util.ensure_path(C["training"]["tag_map"]) - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - morph_rules_path = util.ensure_path(C["training"]["morph_rules"]) - morph_rules = {} - if morph_rules_path is not None: - morph_rules = srsly.read_json(morph_rules_path) - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - msg.divider("Data file validation") # Create the gold corpus to be able to better analyze data - loading_train_error_message = "" - loading_dev_error_message = "" - with msg.loading("Loading corpus..."): - try: - train_dataset = list(Corpus(train_path)(nlp)) - except ValueError as e: - loading_train_error_message = f"Training data cannot be loaded: {e}" - try: - dev_dataset = list(Corpus(dev_path)(nlp)) - except ValueError as e: - loading_dev_error_message = f"Development data cannot be loaded: {e}" - if loading_train_error_message or loading_dev_error_message: - if loading_train_error_message: - 
msg.fail(loading_train_error_message) - if loading_dev_error_message: - msg.fail(loading_dev_error_message) - sys.exit(1) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + train_dataset = list(train_corpus(nlp)) + dev_dataset = list(dev_corpus(nlp)) msg.good("Corpus is loadable") + nlp.initialize(lambda: train_dataset) + msg.good("Pipeline can be initialized with data") + # Create all gold data here to avoid iterating over the train_dataset constantly gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True) gold_train_unpreprocessed_data = _compile_gold( @@ -145,10 +114,10 @@ def debug_data( train_texts = gold_train_data["texts"] dev_texts = gold_dev_data["texts"] - frozen_components = C["training"]["frozen_components"] + frozen_components = T["frozen_components"] msg.divider("Training stats") - msg.text(f"Language: {C['nlp']['lang']}") + msg.text(f"Language: {nlp.lang}") msg.text(f"Training pipeline: {', '.join(pipeline)}") if resume_components: msg.text(f"Components from other pipelines: {', '.join(resume_components)}") @@ -355,17 +324,12 @@ def debug_data( if "tagger" in factory_names: msg.divider("Part-of-speech Tagging") labels = [label for label in gold_train_data["tags"]] - tag_map = nlp.vocab.morphology.tag_map - msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)") + # TODO: does this need to be updated? + msg.info(f"{len(labels)} label(s) in data") labels_with_counts = _format_labels( gold_train_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) - non_tagmap = [l for l in labels if l not in tag_map] - if not non_tagmap: - msg.good(f"All labels present in tag map for language '{nlp.lang}'") - for label in non_tagmap: - msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'") if "parser" in factory_names: has_low_data_warning = False diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 6f554ed2d..3b8ba7dae 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,18 +2,23 @@ from typing import Dict, Any, Optional, Iterable from pathlib import Path from spacy.training import Example -from spacy.util import dot_to_object +from spacy.util import resolve_dot_names from wasabi import msg -from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam +from thinc.api import fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator import typer from ._util import Arg, Opt, debug_cli, show_validation_error -from ._util import parse_config_overrides, string_to_list +from ._util import parse_config_overrides, string_to_list, setup_gpu +from ..schemas import ConfigSchemaTraining +from ..util import registry from .. 
import util -@debug_cli.command("model") +@debug_cli.command( + "model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments @@ -37,11 +42,7 @@ def debug_model_cli( DOCS: https://nightly.spacy.io/api/cli#debug-model """ - if use_gpu >= 0: - msg.info("Using GPU") - require_gpu(use_gpu) - else: - msg.info("Using CPU") + setup_gpu(use_gpu) layers = string_to_list(layers, intify=True) print_settings = { "dimensions": dimensions, @@ -59,14 +60,15 @@ def debug_model_cli( raw_config = util.load_config( config_path, overrides=config_overrides, interpolate=False ) - config = raw_config.iterpolate() + config = raw_config.interpolate() allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - C = util.resolve_training_config(nlp.config) - seed = C["training"]["seed"] + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) @@ -77,11 +79,16 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(C, nlp, model, print_settings=print_settings) + debug_model(config, T, nlp, model, print_settings=print_settings) def debug_model( - config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None + config, + resolved_train_config, + nlp, + model: Model, + *, + print_settings: Optional[Dict[str, Any]] = None, ): if not isinstance(model, Model): msg.fail( @@ -102,13 +109,16 @@ def debug_model( # The output vector might differ from the official type of the output layer with data_validation(False): try: - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - nlp.begin_training(lambda: train_corpus(nlp)) + dot_names = [resolved_train_config["train_corpus"]] + with show_validation_error(): + (train_corpus,) = resolve_dot_names(config, dot_names) + nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") except ValueError: try: _set_output_dim(nO=7, model=model) - nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X]) + with show_validation_error(): + nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index f9954d9ad..566820283 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,11 +3,11 @@ from wasabi import Printer from pathlib import Path import re import srsly -from thinc.api import require_gpu, fix_random_seed +from thinc.api import fix_random_seed from ..training import Corpus from ..tokens import Doc -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, setup_gpu, import_code from ..scorer import Scorer from .. import util from .. 
import displacy @@ -19,6 +19,7 @@ def evaluate_cli( model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), @@ -37,6 +38,7 @@ def evaluate_cli( DOCS: https://nightly.spacy.io/api/cli#evaluate """ + import_code(code_path) evaluate( model, data_path, @@ -61,8 +63,7 @@ def evaluate( ) -> Scorer: msg = Printer(no_print=silent, pretty=not silent) fix_random_seed() - if use_gpu >= 0: - require_gpu(use_gpu) + setup_gpu(use_gpu) data_path = util.ensure_path(data_path) output_path = util.ensure_path(output) displacy_path = util.ensure_path(displacy_path) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py deleted file mode 100644 index 05bf99ccd..000000000 --- a/spacy/cli/init_model.py +++ /dev/null @@ -1,360 +0,0 @@ -from typing import Optional, List, Dict, Any, Union, IO -import math -from tqdm import tqdm -import numpy -from ast import literal_eval -from pathlib import Path -from preshed.counter import PreshCounter -import tarfile -import gzip -import zipfile -import srsly -import warnings -from wasabi import msg, Printer -import typer - -from ._util import app, init_cli, Arg, Opt -from ..vectors import Vectors -from ..errors import Errors, Warnings -from ..language import Language -from ..util import ensure_path, get_lang_class, load_model, OOV_RANK - -try: - import ftfy -except ImportError: - ftfy = None - - -DEFAULT_OOV_PROB = -20 - - -@init_cli.command("vocab") -@app.command( - "init-model", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, - hidden=True, # hide this from main CLI help but still allow it to work with warning -) -def init_model_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - lang: str = Arg(..., help="Pipeline language"), - output_dir: Path = Arg(..., help="Pipeline output directory"), - freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True), - clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), - jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), - vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), - prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"), - truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), - vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. 
en_core_web_lg.vectors"), - model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"), - base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)") - # fmt: on -): - """ - Create a new blank pipeline directory with vocab and vectors from raw data. - If vectors are provided in Word2Vec format, they can be either a .txt or - zipped as a .zip or .tar.gz. - - DOCS: https://nightly.spacy.io/api/cli#init-vocab - """ - if ctx.command.name == "init-model": - msg.warn( - "The init-model command is now called 'init vocab'. You can run " - "'python -m spacy init --help' for an overview of the other " - "available initialization commands." - ) - init_model( - lang, - output_dir, - freqs_loc=freqs_loc, - clusters_loc=clusters_loc, - jsonl_loc=jsonl_loc, - vectors_loc=vectors_loc, - prune_vectors=prune_vectors, - truncate_vectors=truncate_vectors, - vectors_name=vectors_name, - model_name=model_name, - base_model=base_model, - silent=False, - ) - - -def init_model( - lang: str, - output_dir: Path, - freqs_loc: Optional[Path] = None, - clusters_loc: Optional[Path] = None, - jsonl_loc: Optional[Path] = None, - vectors_loc: Optional[Path] = None, - prune_vectors: int = -1, - truncate_vectors: int = 0, - vectors_name: Optional[str] = None, - model_name: Optional[str] = None, - base_model: Optional[str] = None, - silent: bool = True, -) -> Language: - msg = Printer(no_print=silent, pretty=not silent) - if jsonl_loc is not None: - if freqs_loc is not None or clusters_loc is not None: - settings = ["-j"] - if freqs_loc: - settings.append("-f") - if clusters_loc: - settings.append("-c") - msg.warn( - "Incompatible arguments", - "The -f and -c arguments are deprecated, and not compatible " - "with the -j argument, which should specify the same " - "information. 
Either merge the frequencies and clusters data " - "into the JSONL-formatted file (recommended), or use only the " - "-f and -c files, without the other lexical attributes.", - ) - jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = srsly.read_jsonl(jsonl_loc) - else: - clusters_loc = ensure_path(clusters_loc) - freqs_loc = ensure_path(freqs_loc) - if freqs_loc is not None and not freqs_loc.exists(): - msg.fail("Can't find words frequencies file", freqs_loc, exits=1) - lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc) - - with msg.loading("Creating blank pipeline..."): - nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) - - msg.good("Successfully created blank pipeline") - if vectors_loc is not None: - add_vectors( - msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name - ) - vec_added = len(nlp.vocab.vectors) - lex_added = len(nlp.vocab) - msg.good( - "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors", - ) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - return nlp - - -def open_file(loc: Union[str, Path]) -> IO: - """Handle .gz, .tar.gz or unzipped files""" - loc = ensure_path(loc) - if tarfile.is_tarfile(str(loc)): - return tarfile.open(str(loc), "r:gz") - elif loc.parts[-1].endswith("gz"): - return (line.decode("utf8") for line in gzip.open(str(loc), "r")) - elif loc.parts[-1].endswith("zip"): - zip_file = zipfile.ZipFile(str(loc)) - names = zip_file.namelist() - file_ = zip_file.open(names[0]) - return (line.decode("utf8") for line in file_) - else: - return loc.open("r", encoding="utf8") - - -def read_attrs_from_deprecated( - msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path] -) -> List[Dict[str, Any]]: - if freqs_loc is not None: - with msg.loading("Counting frequencies..."): - probs, _ = read_freqs(freqs_loc) - msg.good("Counted frequencies") - else: - probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841 - if clusters_loc: - with msg.loading("Reading clusters..."): - clusters = read_clusters(clusters_loc) - msg.good("Read clusters") - else: - clusters = {} - lex_attrs = [] - sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) - if len(sorted_probs): - for i, (word, prob) in tqdm(enumerate(sorted_probs)): - attrs = {"orth": word, "id": i, "prob": prob} - # Decode as a little-endian string, so that we can do & 15 to get - # the first 4 bits. 
See _parse_features.pyx - if word in clusters: - attrs["cluster"] = int(clusters[word][::-1], 2) - else: - attrs["cluster"] = 0 - lex_attrs.append(attrs) - return lex_attrs - - -def create_model( - lang: str, - lex_attrs: List[Dict[str, Any]], - name: Optional[str] = None, - base_model: Optional[Union[str, Path]] = None, -) -> Language: - if base_model: - nlp = load_model(base_model) - # keep the tokenizer but remove any existing pipeline components due to - # potentially conflicting vectors - for pipe in nlp.pipe_names: - nlp.remove_pipe(pipe) - else: - lang_class = get_lang_class(lang) - nlp = lang_class() - for lexeme in nlp.vocab: - lexeme.rank = OOV_RANK - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) - if len(nlp.vocab): - oov_prob = min(lex.prob for lex in nlp.vocab) - 1 - else: - oov_prob = DEFAULT_OOV_PROB - nlp.vocab.cfg.update({"oov_prob": oov_prob}) - if name: - nlp.meta["name"] = name - return nlp - - -def add_vectors( - msg: Printer, - nlp: Language, - vectors_loc: Optional[Path], - truncate_vectors: int, - prune_vectors: int, - name: Optional[str] = None, -) -> None: - vectors_loc = ensure_path(vectors_loc) - if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) - for lex in nlp.vocab: - if lex.rank and lex.rank != OOV_RANK: - nlp.vocab.vectors.add(lex.orth, row=lex.rank) - else: - if vectors_loc: - with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors( - msg, vectors_loc, truncate_vectors - ) - msg.good(f"Loaded vectors from {vectors_loc}") - else: - vectors_data, vector_keys = (None, None) - if vector_keys is not None: - for word in vector_keys: - if word not in nlp.vocab: - nlp.vocab[word] - if vectors_data is not None: - nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) - if name is None: - # TODO: Is this correct? Does this matter? - nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name - if prune_vectors >= 1: - nlp.vocab.prune_vectors(prune_vectors) - - -def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): - f = open_file(vectors_loc) - f = ensure_shape(f) - shape = tuple(int(size) for size in next(f).split()) - if truncate_vectors >= 1: - shape = (truncate_vectors, shape[1]) - vectors_data = numpy.zeros(shape=shape, dtype="f") - vectors_keys = [] - for i, line in enumerate(tqdm(f)): - line = line.rstrip() - pieces = line.rsplit(" ", vectors_data.shape[1]) - word = pieces.pop(0) - if len(pieces) != vectors_data.shape[1]: - msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) - vectors_data[i] = numpy.asarray(pieces, dtype="f") - vectors_keys.append(word) - if i == truncate_vectors - 1: - break - return vectors_data, vectors_keys - - -def ensure_shape(lines): - """Ensure that the first line of the data is the vectors shape. - - If it's not, we read in the data and output the shape as the first result, - so that the reader doesn't have to deal with the problem. - """ - first_line = next(lines) - try: - shape = tuple(int(size) for size in first_line.split()) - except ValueError: - shape = None - if shape is not None: - # All good, give the data - yield first_line - yield from lines - else: - # Figure out the shape, make it the first value, and then give the - # rest of the data. 
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
new file mode 100644
index 000000000..1c0233539
--- /dev/null
+++ b/spacy/cli/init_pipeline.py
@@ -0,0 +1,117 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    # fmt: on
+):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    nlp = util.get_lang_class(lang)()
+    if jsonl_loc is not None:
+        update_lexemes(nlp, jsonl_loc)
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir.resolve(),
+    )
+
+
+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
+@init_cli.command(
+    "nlp",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,
+)
+def init_pipeline_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the prepared data"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    nlp.to_disk(output_path)
+    msg.good(f"Saved initialized pipeline to {output_path}")
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """Generate JSON files for the labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if not output_path.exists():
+        output_path.mkdir()
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
+        else:
+            msg.info(f"No labels found for {name}")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 29e220b95..de9341449 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu
+from ..training.pretrain import pretrain
+from ..util import load_config
 
 
 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
         config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        silent=False,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch
{epoch_resume}") - row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} - msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) - - def _save_model(epoch, is_temp=False): - is_temp_str = ".temp" if is_temp else "" - with model.use_params(optimizer.averages): - with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: - file_.write(model.get_ref("tok2vec").to_bytes()) - log = { - "nr_word": tracker.nr_word, - "loss": tracker.loss, - "epoch_loss": tracker.epoch_loss, - "epoch": epoch, - } - with (output_dir / "log.jsonl").open("a") as file_: - file_.write(srsly.json_dumps(log) + "\n") - - objective = create_objective(P_cfg["objective"]) - # TODO: I think we probably want this to look more like the - # 'create_train_batches' function? - for epoch in range(epoch_resume, P_cfg["max_epochs"]): - for batch_id, batch in enumerate(batcher(corpus(nlp))): - docs = ensure_docs(batch) - loss = make_update(model, docs, optimizer, objective) - progress = tracker.update(epoch, loss, docs) - if progress: - msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): - _save_model(epoch, is_temp=True) - _save_model(epoch) - tracker.epoch_loss = 0.0 msg.good("Successfully finished pretrain") -def ensure_docs(examples_or_docs): - docs = [] - for eg_or_doc in examples_or_docs: - if isinstance(eg_or_doc, Doc): - docs.append(eg_or_doc) - else: - docs.append(eg_or_doc.reference) - return docs - - -def _resume_model(model, resume_path, epoch_resume): - msg.info(f"Resume training tok2vec from: {resume_path}") - with resume_path.open("rb") as file_: - weights_data = file_.read() - model.get_ref("tok2vec").from_bytes(weights_data) - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(resume_path)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 - msg.info(f"Resuming from epoch: {epoch_resume}") - else: - msg.info(f"Resuming from epoch: {epoch_resume}") - - -def make_update(model, docs, optimizer, objective_func): - """Perform an update over a single batch of documents. - - docs (iterable): A batch of `Doc` objects. - optimizer (callable): An optimizer. - RETURNS loss: A float for the loss. - """ - predictions, backprop = model.begin_update(docs) - loss, gradients = objective_func(model.ops, docs, predictions) - backprop(gradients) - model.finish_update(optimizer) - # Don't want to return a cupy object here - # The gradients are modified in-place by the BERT MLM, - # so we get an accurate loss - return float(loss) - - -def create_objective(config): - """Create the objective for pretraining. - - We'd like to replace this with a registry function but it's tricky because - we're also making a model choice based on this. For now we hard-code support - for two types (characters, vectors). For characters you can specify - n_characters, for vectors you can specify the loss. - - Bleh. 
- """ - objective_type = config["type"] - if objective_type == "characters": - return partial(get_characters_loss, nr_char=config["n_characters"]) - elif objective_type == "vectors": - if config["loss"] == "cosine": - return partial( - get_vectors_loss, - distance=CosineDistance(normalize=True, ignore_zeros=True), - ) - elif config["loss"] == "L2": - return partial( - get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True) - ) - else: - raise ValueError("Unexpected loss type", config["loss"]) - else: - raise ValueError("Unexpected objective_type", objective_type) - - -def get_vectors_loss(ops, docs, prediction, distance): - """Compute a loss based on a distance between the documents' vectors and - the prediction. - """ - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) - target = docs[0].vocab.vectors.data[ids] - d_target, loss = distance(prediction, target) - return loss, d_target - - -def get_characters_loss(ops, docs, prediction, nr_char): - """Compute a loss based on a number of characters predicted from the docs.""" - target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) - target_ids = target_ids.reshape((-1,)) - target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") - target = target.reshape((-1, 256 * nr_char)) - diff = prediction - target - loss = (diff ** 2).sum() - d_target = diff / float(prediction.shape[0]) - return loss, d_target - - -def create_pretraining_model(nlp, pretrain_config): - """Define a network for the pretraining. We simply add an output layer onto - the tok2vec input model. The tok2vec input model needs to be a model that - takes a batch of Doc objects (as a list), and returns a list of arrays. - Each array in the output needs to have one row per token in the doc. - The actual tok2vec layer is stored as a reference, and only this bit will be - serialized to file and read back in when calling the 'train' command. 
- """ - component = nlp.get_pipe(pretrain_config["component"]) - if pretrain_config.get("layer"): - tok2vec = component.model.get_ref(pretrain_config["layer"]) - else: - tok2vec = component.model - - # TODO - maxout_pieces = 3 - hidden_size = 300 - if pretrain_config["objective"]["type"] == "vectors": - model = build_cloze_multi_task_model( - nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces - ) - elif pretrain_config["objective"]["type"] == "characters": - model = build_cloze_characters_multi_task_model( - nlp.vocab, - tok2vec, - hidden_size=hidden_size, - maxout_pieces=maxout_pieces, - nr_char=pretrain_config["objective"]["n_characters"], - ) - model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - set_dropout_rate(model, pretrain_config["dropout"]) - return model - - -class ProgressTracker: - def __init__(self, frequency=1000000): - self.loss = 0.0 - self.prev_loss = 0.0 - self.nr_word = 0 - self.words_per_epoch = Counter() - self.frequency = frequency - self.last_time = time.time() - self.last_update = 0 - self.epoch_loss = 0.0 - - def update(self, epoch, loss, docs): - self.loss += loss - self.epoch_loss += loss - words_in_batch = sum(len(doc) for doc in docs) - self.words_per_epoch[epoch] += words_in_batch - self.nr_word += words_in_batch - words_since_update = self.nr_word - self.last_update - if words_since_update >= self.frequency: - wps = words_since_update / (time.time() - self.last_time) - self.last_update = self.nr_word - self.last_time = time.time() - loss_per_word = self.loss - self.prev_loss - status = ( - epoch, - self.nr_word, - _smart_round(self.loss, width=10), - _smart_round(loss_per_word, width=6), - int(wps), - ) - self.prev_loss = float(self.loss) - return status - else: - return None - - -def _smart_round(figure, width=10, max_decimal=4): - """Round large numbers as integers, smaller numbers as decimals.""" - n_digits = len(str(int(figure))) - n_decimal = width - (n_digits + 1) - if n_decimal <= 1: - return str(int(figure)) - else: - n_decimal = min(n_decimal, max_decimal) - format_str = "%." + str(n_decimal) + "f" - return format_str % figure - - def verify_cli_args(config_path, output_dir, resume_path, epoch_resume): if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index 541253234..6eedc9c20 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -134,7 +134,7 @@ def update_dvc_config( def run_dvc_commands( - commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}, + commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {} ) -> None: """Run a sequence of DVC commands in a subprocess, in order. diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 56faeebfa..3bd237b0a 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements. 
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 
 [system]
 {% if use_transformer -%}
@@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
 window = 128
 stride = 96
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -257,7 +286,7 @@ no_output_layer = false
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"
@@ -270,7 +299,6 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
-augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -318,3 +341,10 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8aef11e02..57a88159d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,23 +1,14 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import srsly
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
 import typer
 import logging
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-from ..language import Language
+from ._util import import_code, setup_gpu
+from ..training.loop import train
+from ..training.initialize import init_nlp
 from ..
import util -from ..training.example import Example -from ..errors import Errors -from ..util import dot_to_object @app.command( @@ -30,8 +21,7 @@ def train_cli( output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - resume: bool = Opt(False, "--resume", "-R", help="Resume training"), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): """ @@ -48,393 +38,19 @@ def train_cli( DOCS: https://nightly.spacy.io/api/cli#train """ - util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) verify_cli_args(config_path, output_path) overrides = parse_config_overrides(ctx.args) import_code(code_path) - train( - config_path, - output_path=output_path, - config_overrides=overrides, - use_gpu=use_gpu, - resume_training=resume, - ) - - -def train( - config_path: Path, - output_path: Optional[Path] = None, - config_overrides: Dict[str, Any] = {}, - use_gpu: int = -1, - resume_training: bool = False, -) -> None: - if use_gpu >= 0: - msg.info(f"Using GPU: {use_gpu}") - require_gpu(use_gpu) - else: - msg.info("Using CPU") - msg.info(f"Loading config and nlp from: {config_path}") + setup_gpu(use_gpu) with show_validation_error(config_path): - # Keep an un-interpolated config so we can preserve variables in - # the final nlp object we train and serialize - raw_config = util.load_config( - config_path, overrides=config_overrides, interpolate=False - ) - config = raw_config.interpolate() - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) - allocator = config["training"]["gpu_allocator"] - if use_gpu >= 0 and allocator: - set_gpu_allocator(allocator) - # Use original config here before it's resolved to functions - sourced_components = get_sourced_components(config) - with show_validation_error(config_path): - nlp = util.load_model_from_config(raw_config) - # Resolve all training-relevant sections using the filled nlp config - C = util.resolve_training_config(nlp.config) - util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"]) - if C["training"]["vectors"] is not None: - add_vectors(nlp, C["training"]["vectors"]) - raw_text, tag_map, morph_rules, weights_data = load_from_paths(C) - T_cfg = C["training"] - optimizer = T_cfg["optimizer"] - train_corpus = dot_to_object(C, T_cfg["train_corpus"]) - dev_corpus = dot_to_object(C, T_cfg["dev_corpus"]) - batcher = T_cfg["batcher"] - train_logger = T_cfg["logger"] - before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) - # Components that shouldn't be updated during training - frozen_components = T_cfg["frozen_components"] - # Sourced components that require resume_training - resume_components = [p for p in sourced_components if p not in frozen_components] - msg.info(f"Pipeline: {nlp.pipe_names}") - if resume_components: - with nlp.select_pipes(enable=resume_components): - msg.info(f"Resuming training for: {resume_components}") - nlp.resume_training(sgd=optimizer) - with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) 
- # Verify the config after calling 'begin_training' to ensure labels are properly initialized - verify_config(nlp) - - if tag_map: - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) - if morph_rules: - # Load morph rules - nlp.vocab.morphology.load_morph_exceptions(morph_rules) - - # Load pretrained tok2vec weights - cf. CLI command 'pretrain' - if weights_data is not None: - tok2vec_component = C["pretraining"]["component"] - if tok2vec_component is None: - msg.fail( - f"To use pretrained tok2vec weights, [pretraining.component] " - f"needs to specify the component that should load them.", - exits=1, - ) - layer = nlp.get_pipe(tok2vec_component).model - tok2vec_layer = C["pretraining"]["layer"] - if tok2vec_layer: - layer = layer.get_ref(tok2vec_layer) - layer.from_bytes(weights_data) - msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") - - # Create iterator, which yields out info after each optimization step. - msg.info("Start training") - score_weights = T_cfg["score_weights"] - training_step_iterator = train_while_improving( - nlp, - optimizer, - create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]), - create_evaluation_callback(nlp, dev_corpus, score_weights), - dropout=T_cfg["dropout"], - accumulate_gradient=T_cfg["accumulate_gradient"], - patience=T_cfg["patience"], - max_steps=T_cfg["max_steps"], - eval_frequency=T_cfg["eval_frequency"], - raw_text=None, - exclude=frozen_components, - ) - msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=frozen_components): - print_row, finalize_logger = train_logger(nlp) - - try: - progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) - progress.set_description(f"Epoch 1") - for batch, info, is_best_checkpoint in training_step_iterator: - progress.update(1) - if is_best_checkpoint is not None: - progress.close() - print_row(info) - if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=frozen_components): - update_meta(T_cfg, nlp, info) - with nlp.use_params(optimizer.averages): - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-best") - progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) - progress.set_description(f"Epoch {info['epoch']}") - except Exception as e: - finalize_logger() - if output_path is not None: - # We don't want to swallow the traceback if we don't have a - # specific error. - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}" - ) - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-final") - raise e - finally: - finalize_logger() - if output_path is not None: - final_model_path = output_path / "model-final" - if optimizer.averages: - with nlp.use_params(optimizer.averages): - nlp.to_disk(final_model_path) - else: - nlp.to_disk(final_model_path) - msg.good(f"Saved pipeline to output directory {final_model_path}") - - -def add_vectors(nlp: Language, vectors: str) -> None: - title = f"Config validation error for vectors {vectors}" - desc = ( - "This typically means that there's a problem in the config.cfg included " - "with the packaged vectors. Make sure that the vectors package you're " - "loading is compatible with the current version of spaCy." 
- ) - with show_validation_error( - title=title, desc=desc, hint_fill=False, show_config=False - ): - util.load_vectors_into_model(nlp, vectors) - - -def create_train_batches(iterator, batcher, max_epochs: int): - epoch = 0 - examples = list(iterator) - if not examples: - # Raise error if no data - raise ValueError(Errors.E986) - while max_epochs < 1 or epoch != max_epochs: - random.shuffle(examples) - for batch in batcher(examples): - yield epoch, batch - epoch += 1 - - -def create_evaluation_callback( - nlp: Language, dev_corpus: Callable, weights: Dict[str, float] -) -> Callable[[], Tuple[float, Dict[str, float]]]: - weights = {key: value for key, value in weights.items() if value is not None} - - def evaluate() -> Tuple[float, Dict[str, float]]: - dev_examples = list(dev_corpus(nlp)) - scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score. - # We can only consider scores that are ints/floats, not dicts like - # entity scores per type etc. - for key, value in scores.items(): - if key in weights and not isinstance(value, (int, float)): - raise ValueError(Errors.E915.format(name=key, score_type=type(value))) - try: - weighted_score = sum( - scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights - ) - except KeyError as e: - keys = list(scores.keys()) - err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) - raise KeyError(err) from None - return weighted_score, scores - - return evaluate - - -def create_before_to_disk_callback( - callback: Optional[Callable[[Language], Language]] -) -> Callable[[Language], Language]: - def before_to_disk(nlp: Language) -> Language: - if not callback: - return nlp - modified_nlp = callback(nlp) - if not isinstance(modified_nlp, Language): - err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) - raise ValueError(err) - return modified_nlp - - return before_to_disk - - -def train_while_improving( - nlp: Language, - optimizer: Optimizer, - train_data, - evaluate, - *, - dropout: float, - eval_frequency: int, - accumulate_gradient: int, - patience: int, - max_steps: int, - raw_text: List[Dict[str, str]], - exclude: List[str], -): - """Train until an evaluation stops improving. Works as a generator, - with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, - where info is a dict, and is_best_checkpoint is in [True, False, None] -- - None indicating that the iteration was not evaluated as a checkpoint. - The evaluation is conducted by calling the evaluate callback. - - Positional arguments: - nlp: The spaCy pipeline to evaluate. - optimizer: The optimizer callable. - train_data (Iterable[Batch]): A generator of batches, with the training - data. Each batch should be a Sized[Tuple[Input, Annot]]. The training - data iterable needs to take care of iterating over the epochs and - shuffling. - evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. - The callback should take no arguments and return a tuple - `(main_score, other_scores)`. The main_score should be a float where - higher is better. other_scores can be any object. - - Every iteration, the function yields out a tuple with: - - * batch: A list of Example objects. - * info: A dict with various information about the last update (see below). - * is_best_checkpoint: A value in None, False, True, indicating whether this - was the best evaluation so far. You should use this to save the model - checkpoints during training. If None, evaluation was not conducted on - that iteration. 
False means evaluation was conducted, but a previous - evaluation was better. - - The info dict provides the following information: - - epoch (int): How many passes over the data have been completed. - step (int): How many steps have been completed. - score (float): The main score from the last evaluation. - other_scores: : The other scores from the last evaluation. - losses: The accumulated losses throughout training. - checkpoints: A list of previous results, where each result is a - (score, step, epoch) tuple. - """ - if isinstance(dropout, float): - dropouts = thinc.schedules.constant(dropout) - else: - dropouts = dropout - results = [] - losses = {} - if raw_text: - random.shuffle(raw_text) - raw_examples = [ - Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text - ] - raw_batches = util.minibatch(raw_examples, size=8) - - words_seen = 0 - start_time = timer() - for step, (epoch, batch) in enumerate(train_data): - dropout = next(dropouts) - for subbatch in subdivide_batch(batch, accumulate_gradient): - - nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude - ) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. - raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude) - # TODO: refactor this so we don't have to run it separately in here - for name, proc in nlp.pipeline: - if ( - name not in exclude - and hasattr(proc, "model") - and proc.model not in (True, False, None) - ): - proc.model.finish_update(optimizer) - optimizer.step_schedules() - if not (step % eval_frequency): - if optimizer.averages: - with nlp.use_params(optimizer.averages): - score, other_scores = evaluate() - else: - score, other_scores = evaluate() - results.append((score, step)) - is_best_checkpoint = score == max(results)[0] - else: - score, other_scores = (None, None) - is_best_checkpoint = None - words_seen += sum(len(eg) for eg in batch) - info = { - "epoch": epoch, - "step": step, - "score": score, - "other_scores": other_scores, - "losses": losses, - "checkpoints": results, - "seconds": int(timer() - start_time), - "words": words_seen, - } - yield batch, info, is_best_checkpoint - if is_best_checkpoint is not None: - losses = {} - # Stop if no improvement in `patience` updates (if specified) - best_score, best_step = max(results) - if patience and (step - best_step) >= patience: - break - # Stop if we've exhausted our max steps (if specified) - if max_steps and step >= max_steps: - break - - -def subdivide_batch(batch, accumulate_gradient): - batch = list(batch) - batch.sort(key=lambda eg: len(eg.predicted)) - sub_len = len(batch) // accumulate_gradient - start = 0 - for i in range(accumulate_gradient): - subbatch = batch[start : start + sub_len] - if subbatch: - yield subbatch - start += len(subbatch) - subbatch = batch[start:] - if subbatch: - yield subbatch - - -def update_meta( - training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] -) -> None: - nlp.meta["performance"] = {} - for metric in training["score_weights"]: - if metric is not None: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) - for pipe_name in nlp.pipe_names: - nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] - - -def load_from_paths( - config: Config, -) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]: - # TODO: separate checks from loading - raw_text = 
util.ensure_path(config["training"]["raw_text"]) - if raw_text is not None: - if not raw_text.exists(): - msg.fail("Can't find raw text", raw_text, exits=1) - raw_text = list(srsly.read_jsonl(config["training"]["raw_text"])) - tag_map = {} - morph_rules = {} - weights_data = None - init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"]) - if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: - weights_data = file_.read() - return raw_text, tag_map, morph_rules, weights_data + config = util.load_config(config_path, overrides=overrides, interpolate=False) + msg.divider("Initializing pipeline") + with show_validation_error(config_path, hint_fill=False): + nlp = init_nlp(config, use_gpu=use_gpu) + msg.good("Initialized pipeline") + msg.divider("Training pipeline") + train(nlp, output_path, use_gpu=use_gpu, silent=False) def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: @@ -445,30 +61,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No if not output_path.exists(): output_path.mkdir() msg.good(f"Created output directory: {output_path}") - - -def verify_config(nlp: Language) -> None: - """Perform additional checks based on the config, loaded nlp object and training data.""" - # TODO: maybe we should validate based on the actual components, the list - in config["nlp"]["pipeline"] instead? - for pipe_config in nlp.config["components"].values(): - # We can't assume that the component name == the factory - factory = pipe_config["factory"] - if factory == "textcat": - verify_textcat_config(nlp, pipe_config) - - -def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None: - # if 'positive_label' is provided: double check whether it's in the data and - # the task is binary - if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").labels - pos_label = pipe_config.get("positive_label") - if pos_label not in textcat_labels: - raise ValueError( - Errors.E920.format(pos_label=pos_label, labels=textcat_labels) - ) - if len(list(textcat_labels)) != 2: - raise ValueError( - Errors.E919.format(pos_label=pos_label, labels=textcat_labels) - ) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 63a0742e3..d7fc46ea0 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,7 +1,7 @@ [paths] -train = "" -dev = "" -raw = null +train = null +dev = null +vectors = null init_tok2vec = null [system] @@ -10,8 +10,13 @@ gpu_allocator = null [nlp] lang = null +# List of pipeline component names, in order. The names should correspond to +# components defined in the [components] block pipeline = [] +# Components that are loaded but disabled by default disabled = [] +# Optional callbacks to modify the nlp object before it's initialized, after +# it's created and after the pipeline has been set up before_creation = null after_creation = null after_pipeline_creation = null @@ -19,6 +24,7 @@ after_pipeline_creation = null [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" +# The pipeline components and their models [components] # Readers for corpora like dev and train. @@ -37,9 +43,8 @@ max_length = 0 limit = 0 # Apply some simply data augmentation, where we replace tokens with variations. # This is especially useful for punctuation and case replacement, to help -# generalize beyond corpora that don't have smart-quotes, or only have smart -quotes, etc.
-augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5} +# generalize beyond corpora that don't/only have smart quotes etc. +augmenter = null [corpora.dev] @readers = "spacy.Corpus.v1" @@ -52,6 +57,8 @@ gold_preproc = false max_length = 0 # Limitation on number of training examples limit = 0 +# Optional callback for data augmentation +augmenter = null # Training hyper-parameters and additional features. [training] @@ -59,11 +66,6 @@ seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 -# Extra resources for transfer-learning or pseudo-rehearsal -init_tok2vec = ${paths.init_tok2vec} -raw_text = ${paths.raw} -vectors = null -lookups = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 max_epochs = 0 @@ -104,3 +106,19 @@ grad_clip = 1.0 use_averages = false eps = 1e-8 learn_rate = 0.001 + +# These settings are used when nlp.initialize() is called (typically before +# training or pretraining). Components and the tokenizer can each define their +# own arguments via their initialize methods that are populated by the config. +# This lets them gather data resources, build label sets etc. +[initialize] +vectors = ${paths.vectors} +# Extra resources for transfer-learning or pseudo-rehearsal +init_tok2vec = ${paths.init_tok2vec} +# Data and lookups for vocabulary +vocab_data = null +lookups = null +# Arguments passed to the tokenizer's initialize method +tokenizer = {} +# Arguments for initialize methods of the components (keyed by component) +components = {} diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index bbd595308..66987171a 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -1,3 +1,6 @@ +[paths] +raw_text = null + [pretraining] max_epochs = 1000 dropout = 0.2 @@ -31,8 +34,8 @@ learn_rate = 0.001 [corpora] [corpora.pretrain] -@readers = "spacy.JsonlReader.v1" -path = ${paths.raw} +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} min_length = 5 max_length = 500 limit = 0 diff --git a/spacy/errors.py b/spacy/errors.py index 640419182..dbb25479d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,6 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master + W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") @@ -306,7 +307,7 @@ class Errors: "settings: {opts}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") E109 = ("Component '{name}' could not be run. Did you forget to " - "call begin_training()?") + "call initialize()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -376,7 +377,7 @@ class Errors: "provided {found}.") E143 = ("Labels for component '{name}' not initialized. 
This can be fixed " "by calling add_label, or by providing a representative batch of " - "examples to the component's begin_training method.") + "examples to the component's initialize method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -418,7 +419,7 @@ class Errors: E164 = ("x is neither increasing nor decreasing: {}.") E165 = ("Only one class present in y_true. ROC AUC score is not defined in " "that case.") - E166 = ("Can only merge DocBins with the same pre-defined attributes.\n" + E166 = ("Can only merge DocBins with the same value for '{param}'.\n" "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") E170 = ("Cannot apply transition {name}: invalid for the current state.") @@ -476,6 +477,10 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found " + "for mode '{mode}'. Required tables: {tables}. Found: {found}.") + E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " + "config.cfg or override it on the CLI?") E914 = ("Executing {name} callback failed. Expected the function to " "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") @@ -517,7 +522,7 @@ class Errors: "but the provided argument {loc} points to a file.") E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " "not seem to exist.") - E930 = ("Received invalid get_examples callback in {name}.begin_training. " + E930 = ("Received invalid get_examples callback in {name}.initialize. " "Expected function that returns an iterable of Example objects but " "got: {obj}") E931 = ("Encountered Pipe subclass without Pipe.{method} method in component " @@ -553,7 +558,10 @@ class Errors: E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " "component.") - E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") + E955 = ("Can't find table(s) {table} for language '{lang}' in " + "spacy-lookups-data. Make sure you have the package installed or " + "provide your own lookup tables if no default lookups are available " + "for your language.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") E957 = ("Writing directly to Language.factories isn't needed anymore in " @@ -670,18 +678,17 @@ class Errors: "'{token_attrs}'.") E999 = ("Unable to merge the `Doc` objects because they do not all share " "the same `Vocab`.") - E1000 = ("No pkuseg model available. Provide a pkuseg model when " - "initializing the pipeline:\n" - 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n' - 'nlp = Chinese(config=cfg)') + E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was " + "loaded. Provide the name of a pretrained model or the path to " + "a model and initialize the pipeline:\n\n" + 'nlp.tokenizer.initialize(pkuseg_model="default")') E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") E1003 = ("Unsupported lemmatizer mode '{mode}'.") E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. 
" - "Required tables '{tables}', found '{found}'. If you are not " - "providing custom lookups, make sure you have the package " - "spacy-lookups-data installed.") + "Required tables: {tables}. Found: {found}. Maybe you forgot to " + "call nlp.initialize() to load in the data?") E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for " "'{chunk}'. Tokenizer exceptions are only allowed to specify " "`ORTH` and `NORM`.") @@ -698,6 +705,9 @@ class Errors: "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " "provided to doc.set_ents as lists of `Span` objects.") + E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the " + "token itself. To set the morph from this MorphAnalysis, set from " + "the string value with: `token.set_morph(str(other_morph))`.") @add_codes diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 923e29a17..879229888 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language -from ...lookups import Lookups from ...pipeline import Lemmatizer @@ -24,18 +23,11 @@ class Bengali(Language): @Bengali.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) - return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return Lemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Bengali"] diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 1a7b19914..53069334e 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lemmatizer import GreekLemmatizer -from ...lookups import Lookups from ...language import Language @@ -29,18 +28,11 @@ class Greek(Language): @Greek.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups) - return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return GreekLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index bf7e9987f..3a3ebeefd 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,5 +1,4 @@ from typing import Optional - from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -9,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES from .lemmatizer import 
EnglishLemmatizer from ...language import Language -from ...lookups import Lookups class EnglishDefaults(Language.Defaults): @@ -28,18 +26,11 @@ class English(Language): @English.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups) - return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return EnglishLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["English"] diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index be389f117..2cb0f9a53 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -3,8 +3,7 @@ from ...tokens import Token class EnglishLemmatizer(Lemmatizer): - """English lemmatizer. Only overrides is_base_form. - """ + """English lemmatizer. Only overrides is_base_form.""" def is_base_form(self, token: Token) -> bool: """ diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index ad0a1b838..4dd4f99be 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -58,7 +58,7 @@ def noun_bounds( doc, token, np_left_deps, np_right_deps, stop_deps ) filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps - if list(filter(filter_func, doc[left_bound.i : right.i],)): + if list(filter(filter_func, doc[left_bound.i : right.i])): break else: right_bound = right diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index f3a6635dc..77ee3bca3 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...lookups import Lookups from ...pipeline import Lemmatizer @@ -27,18 +26,11 @@ class Persian(Language): @Persian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) - return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return Lemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Persian"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 72e641d1f..1e0011fba 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from .lemmatizer import FrenchLemmatizer -from ...lookups import Lookups from ...language import Language @@ -32,18 +31,11 @@ class French(Language): @French.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: 
Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups) - return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return FrenchLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["French"] diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index e7cc1ef3b..4e6bf9d3c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any from pathlib import Path import srsly from collections import namedtuple -from thinc.api import Config from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS @@ -16,7 +15,7 @@ from ...scorer import Scorer from ...symbols import POS from ...tokens import Doc from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str from ... import util @@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer): class JapaneseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) stop_words = STOP_WORDS syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index dd07ef89c..83c9f4962 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,5 +1,4 @@ from typing import Optional, Any, Dict -from thinc.api import Config from .stop_words import STOP_WORDS from .tag_map import TAG_MAP @@ -10,7 +9,7 @@ from ...compat import copy_reg from ...scorer import Scorer from ...symbols import POS from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer): class KoreanDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 9672dfd6e..62d7707f3 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...lookups import Lookups from ...pipeline import Lemmatizer @@ -27,18 +26,11 @@ class Norwegian(Language): @Norwegian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) - return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return Lemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Norwegian"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 15b6b9de2..a3591f1bf 100644 --- 
a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,5 +1,4 @@ from typing import Optional - from thinc.api import Model from .stop_words import STOP_WORDS @@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer -from ...lookups import Lookups from ...language import Language @@ -29,18 +27,11 @@ class Dutch(Language): @Dutch.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups) - return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return DutchLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 573dbc6f9..f7be8a6c2 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -34,18 +34,11 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "lookups": None}, + default_config={"model": None, "mode": "pos_lookup"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups) - return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return PolishLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Polish"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 4a296dd23..1d59ca043 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,5 +1,4 @@ from typing import Optional - from thinc.api import Model from .stop_words import STOP_WORDS @@ -7,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer from ...language import Language -from ...lookups import Lookups class RussianDefaults(Language.Defaults): @@ -24,17 +22,11 @@ class Russian(Language): @Russian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return RussianLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Russian"] diff --git a/spacy/lang/sa/lex_attrs.py b/spacy/lang/sa/lex_attrs.py index f2b51650b..bdceb7ec2 100644 --- a/spacy/lang/sa/lex_attrs.py +++ b/spacy/lang/sa/lex_attrs.py @@ -108,8 +108,8 @@ _num_words = [ def like_num(text): """ - Check if text resembles a number - """ + Check if text resembles a number + """ if text.startswith(("+", "-", "±", "~")): text = text[1:] 
text = text.replace(",", "").replace(".", "") diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index ea314f487..2490eb9ec 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language -from ...lookups import Lookups from ...pipeline import Lemmatizer @@ -30,18 +29,11 @@ class Swedish(Language): @Swedish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "lookups": None}, + default_config={"model": None, "mode": "rule"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) - return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return Lemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Swedish"] diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index a35ae987f..219c50c1a 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,10 +1,8 @@ -from thinc.api import Config - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc -from ...util import DummyTokenizer, registry +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -42,7 +40,7 @@ class ThaiTokenizer(DummyTokenizer): class ThaiDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 006a1cf7f..73c065379 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import UkrainianLemmatizer from ...language import Language -from ...lookups import Lookups class UkrainianDefaults(Language.Defaults): @@ -24,17 +23,11 @@ class Ukrainian(Language): @Ukrainian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + default_config={"model": None, "mode": "pymorphy2"}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer( - nlp: Language, - model: Optional[Model], - name: str, - mode: str, - lookups: Optional[Lookups], -): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) +def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str): + return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode) __all__ = ["Ukrainian"] diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 1db762adb..1328de495 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,10 +1,8 @@ -from thinc.api import Config - +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc -from .stop_words import STOP_WORDS -from ...util import DummyTokenizer, registry -from .lex_attrs import LEX_ATTRS +from ...util import DummyTokenizer, registry, load_config_from_str DEFAULT_CONFIG = """ @@ -17,7 +15,7 @@ use_pyvi = true @registry.tokenizers("spacy.vi.VietnameseTokenizer") -def 
create_vietnamese_tokenizer(use_pyvi: bool = True,): +def create_vietnamese_tokenizer(use_pyvi: bool = True): def vietnamese_tokenizer_factory(nlp): return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) @@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer): class VietnameseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index fa9bb810d..858f41f65 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,23 +1,25 @@ -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Callable, Iterable from enum import Enum import tempfile import srsly import warnings from pathlib import Path -from thinc.api import Config from ...errors import Warnings, Errors from ...language import Language from ...scorer import Scorer from ...tokens import Doc -from ...training import validate_examples -from ...util import DummyTokenizer, registry +from ...training import validate_examples, Example +from ...util import DummyTokenizer, registry, load_config_from_str from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ... import util -_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" +# fmt: off +_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`" +_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7." +# fmt: on DEFAULT_CONFIG = """ [nlp] @@ -25,6 +27,10 @@ DEFAULT_CONFIG = """ [nlp.tokenizer] @tokenizers = "spacy.zh.ChineseTokenizer" segmenter = "char" + +[initialize] + +[initialize.tokenizer] pkuseg_model = null pkuseg_user_dict = "default" """ @@ -41,41 +47,23 @@ class Segmenter(str, Enum): @registry.tokenizers("spacy.zh.ChineseTokenizer") -def create_chinese_tokenizer( - segmenter: Segmenter = Segmenter.char, - pkuseg_model: Optional[str] = None, - pkuseg_user_dict: Optional[str] = "default", -): +def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,): def chinese_tokenizer_factory(nlp): - return ChineseTokenizer( - nlp, - segmenter=segmenter, - pkuseg_model=pkuseg_model, - pkuseg_user_dict=pkuseg_user_dict, - ) + return ChineseTokenizer(nlp, segmenter=segmenter) return chinese_tokenizer_factory class ChineseTokenizer(DummyTokenizer): def __init__( - self, - nlp: Language, - segmenter: Segmenter = Segmenter.char, - pkuseg_model: Optional[str] = None, - pkuseg_user_dict: Optional[str] = None, + self, nlp: Language, segmenter: Segmenter = Segmenter.char, ): self.vocab = nlp.vocab - if isinstance(segmenter, Segmenter): # we might have the Enum here + if isinstance(segmenter, Segmenter): segmenter = segmenter.value self.segmenter = segmenter - self.pkuseg_model = pkuseg_model - self.pkuseg_user_dict = pkuseg_user_dict self.pkuseg_seg = None self.jieba_seg = None - self.configure_segmenter(segmenter) - - def configure_segmenter(self, segmenter: str): if segmenter not in Segmenter.values(): warn_msg = Warnings.W103.format( lang="Chinese", @@ -85,12 +73,21 @@ class ChineseTokenizer(DummyTokenizer): ) warnings.warn(warn_msg) self.segmenter = Segmenter.char - self.jieba_seg = try_jieba_import(self.segmenter) - self.pkuseg_seg = try_pkuseg_import( - self.segmenter, - pkuseg_model=self.pkuseg_model, - 
pkuseg_user_dict=self.pkuseg_user_dict, - ) + if segmenter == Segmenter.jieba: + self.jieba_seg = try_jieba_import() + + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + nlp: Optional[Language] = None, + pkuseg_model: Optional[str] = None, + pkuseg_user_dict: str = "default", + ): + if self.segmenter == Segmenter.pkuseg: + self.pkuseg_seg = try_pkuseg_import( + pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict, + ) def __call__(self, text: str) -> Doc: if self.segmenter == Segmenter.jieba: @@ -145,14 +142,10 @@ class ChineseTokenizer(DummyTokenizer): def _get_config(self) -> Dict[str, Any]: return { "segmenter": self.segmenter, - "pkuseg_model": self.pkuseg_model, - "pkuseg_user_dict": self.pkuseg_user_dict, } def _set_config(self, config: Dict[str, Any] = {}) -> None: self.segmenter = config.get("segmenter", Segmenter.char) - self.pkuseg_model = config.get("pkuseg_model", None) - self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default") def to_bytes(self, **kwargs): pkuseg_features_b = b"" @@ -163,6 +156,22 @@ class ChineseTokenizer(DummyTokenizer): self.pkuseg_seg.feature_extractor.save(tempdir) self.pkuseg_seg.model.save(tempdir) tempdir = Path(tempdir) + # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which + # means that it will be saved with pickle protocol 5 with + # python 3.8, which can't be reloaded with python 3.6-3.7. + # To try to make the model compatible with python 3.6+, reload + # the data with pickle5 and convert it back to protocol 4. + try: + import pickle5 + + with open(tempdir / "features.pkl", "rb") as fileh: + features = pickle5.load(fileh) + with open(tempdir / "features.pkl", "wb") as fileh: + pickle5.dump(features, fileh, protocol=4) + except ImportError as e: + raise e + except Exception: + warnings.warn(_PKUSEG_PICKLE_WARNING) with open(tempdir / "features.pkl", "rb") as fileh: pkuseg_features_b = fileh.read() with open(tempdir / "weights.npz", "rb") as fileh: @@ -235,6 +244,18 @@ class ChineseTokenizer(DummyTokenizer): path.mkdir(parents=True) self.pkuseg_seg.model.save(path) self.pkuseg_seg.feature_extractor.save(path) + # try to convert features.pkl to pickle protocol 4 + try: + import pickle5 + + with open(path / "features.pkl", "rb") as fileh: + features = pickle5.load(fileh) + with open(path / "features.pkl", "wb") as fileh: + pickle5.dump(features, fileh, protocol=4) + except ImportError as e: + raise e + except Exception: + warnings.warn(_PKUSEG_PICKLE_WARNING) def save_pkuseg_processors(path): if self.pkuseg_seg: @@ -291,7 +312,7 @@ class ChineseTokenizer(DummyTokenizer): class ChineseDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) + config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @@ -302,47 +323,33 @@ class Chinese(Language): Defaults = ChineseDefaults -def try_jieba_import(segmenter: str) -> None: +def try_jieba_import() -> None: try: import jieba - if segmenter == Segmenter.jieba: - # segment a short text to have jieba initialize its cache in advance - list(jieba.cut("作为", cut_all=False)) + # segment a short text to have jieba initialize its cache in advance + list(jieba.cut("作为", cut_all=False)) return jieba except ImportError: - if segmenter == Segmenter.jieba: - msg = ( - "Jieba not installed. 
To use jieba, install it with `pip " - " install jieba` or from https://github.com/fxsjy/jieba" - ) - raise ImportError(msg) from None + msg = ( + "Jieba not installed. To use jieba, install it with `pip " + " install jieba` or from https://github.com/fxsjy/jieba" + ) + raise ImportError(msg) from None -def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None: +def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None: try: import pkuseg - if pkuseg_model: - return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) - elif segmenter == Segmenter.pkuseg: - msg = ( - "The Chinese word segmenter is 'pkuseg' but no pkuseg model " - "was specified. Please provide the name of a pretrained model " - "or the path to a model with:\n" - 'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n' - "nlp = Chinese.from_config(cfg)" - ) - raise ValueError(msg) + return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict) except ImportError: - if segmenter == Segmenter.pkuseg: - msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG - raise ImportError(msg) from None + msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG + raise ImportError(msg) from None except FileNotFoundError: - if segmenter == Segmenter.pkuseg: - msg = "Unable to load pkuseg model from: " + pkuseg_model - raise FileNotFoundError(msg) from None + msg = "Unable to load pkuseg model from: " + pkuseg_model + raise FileNotFoundError(msg) from None def _get_pkuseg_trie_data(node, path=""): diff --git a/spacy/language.py b/spacy/language.py index f161b2877..18c08258f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer +from thinc.api import Model, get_current_ops, Config, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle @@ -18,8 +18,9 @@ from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples +from .training.initialize import init_vocab, init_tok2vec from .scorer import Scorer -from .util import create_default_optimizer, registry, SimpleFrozenList +from .util import registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc from .tokenizer import Tokenizer from .errors import Errors, Warnings -from .schemas import ConfigSchema, ConfigSchemaNlp +from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit +from .schemas import ConfigSchemaPretrain, validate_init_settings from .git_info import GIT_VERSION from . import util from . 
import about @@ -1066,7 +1068,7 @@ class Language: validate_examples(examples, "Language.update") if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer() + self._optimizer = self.create_optimizer() sgd = self._optimizer if component_cfg is None: component_cfg = {} @@ -1124,7 +1126,7 @@ class Language: validate_examples(examples, "Language.rehearse") if sgd is None: if self._optimizer is None: - self._optimizer = create_default_optimizer() + self._optimizer = self.create_optimizer() sgd = self._optimizer pipes = list(self.pipeline) random.shuffle(pipes) @@ -1154,61 +1156,73 @@ class Language: get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, sgd: Optional[Optimizer] = None, - device: int = -1, + ) -> Optimizer: + warnings.warn(Warnings.W089, DeprecationWarning) + return self.initialize(get_examples, sgd=sgd) + + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + sgd: Optional[Optimizer] = None, ) -> Optimizer: """Initialize the pipe for training, using data examples if available. get_examples (Callable[[], Iterable[Example]]): Optional function that returns gold-standard Example objects. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. + sgd (Optional[Optimizer]): An optimizer to use for updates. If not + provided, will be created using the .create_optimizer() method. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/language#begin_training + DOCS: https://nightly.spacy.io/api/language#initialize """ if get_examples is None: util.logger.debug( - "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" + "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples" ) doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] - # Populate vocab if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="Language", obj=type(get_examples)) raise ValueError(err) - valid_examples = False - for example in get_examples(): - if not isinstance(example, Example): - err = Errors.E978.format( - name="Language.begin_training", types=type(example) - ) - raise ValueError(err) - else: - valid_examples = True - for word in [t.text for t in example.reference]: - _ = self.vocab[word] # noqa: F841 - if not valid_examples: - err = Errors.E930.format(name="Language", obj="empty list") - raise ValueError(err) - if device >= 0: # TODO: do we need this here? 
- require_gpu(device) - if self.vocab.vectors.data.shape[1] >= 1: - ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - if sgd is None: - sgd = create_default_optimizer() - self._optimizer = sgd + # Make sure the config is interpolated so we can resolve subsections + config = self.config.interpolate() + # These are the settings provided in the [initialize] block in the config + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + init_vocab( + self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"] + ) + pretrain_cfg = config.get("pretraining") + if pretrain_cfg: + P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) + init_tok2vec(self, P, I) + if self.vocab.vectors.data.shape[1] >= 1: + ops = get_current_ops() + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + if hasattr(self.tokenizer, "initialize"): + tok_settings = validate_init_settings( + self.tokenizer.initialize, + I["tokenizer"], + section="tokenizer", + name="tokenizer", + ) + self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) for name, proc in self.pipeline: - if hasattr(proc, "begin_training"): - proc.begin_training( - get_examples, pipeline=self.pipeline, sgd=self._optimizer + if hasattr(proc, "initialize"): + p_settings = I["components"].get(name, {}) + p_settings = validate_init_settings( + proc.initialize, p_settings, section="components", name=name ) + proc.initialize(get_examples, nlp=self, **p_settings) self._link_components() + self._optimizer = sgd + if sgd is not None: + self._optimizer = sgd + elif self._optimizer is None: + self._optimizer = self.create_optimizer() return self._optimizer - def resume_training( - self, *, sgd: Optional[Optimizer] = None, device: int = -1 - ) -> Optimizer: + def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer: """Continue training a pretrained model. Create and return an optimizer, and initialize "rehearsal" for any pipeline @@ -1217,22 +1231,20 @@ class Language: rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects. - sgd (Optional[Optimizer]): An optimizer. RETURNS (Optimizer): The optimizer. DOCS: https://nightly.spacy.io/api/language#resume_training """ - if device >= 0: # TODO: do we need this here? 
- require_gpu(device) - ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - if sgd is None: - sgd = create_default_optimizer() - self._optimizer = sgd + ops = get_current_ops() + if self.vocab.vectors.data.shape[1] >= 1: + self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) + if sgd is not None: + self._optimizer = sgd + elif self._optimizer is None: + self._optimizer = self.create_optimizer() return self._optimizer def evaluate( @@ -1294,6 +1306,11 @@ class Language: results["speed"] = n_words / (end_time - start_time) return results + def create_optimizer(self): + """Create an optimizer, usually using the [training.optimizer] config.""" + subconfig = {"optimizer": self.config["training"]["optimizer"]} + return registry.resolve(subconfig)["optimizer"] + @contextmanager def use_params(self, params: Optional[dict]): """Replace weights of models in the pipeline with those provided in the @@ -1502,7 +1519,7 @@ class Language: ).merge(config) if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) - config_lang = config["nlp"]["lang"] + config_lang = config["nlp"].get("lang") if config_lang is not None and config_lang != cls.lang: raise ValueError( Errors.E958.format( diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py new file mode 100644 index 000000000..ed2918f02 --- /dev/null +++ b/spacy/ml/featureextractor.py @@ -0,0 +1,28 @@ +from typing import List, Union, Callable, Tuple +from thinc.types import Ints2d +from thinc.api import Model, registry + +from ..tokens import Doc + + +@registry.layers("spacy.FeatureExtractor.v1") +def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]: + return Model("extract_features", forward, attrs={"columns": columns}) + + +def forward( + model: Model[List[Doc], List[Ints2d]], docs, is_train: bool +) -> Tuple[List[Ints2d], Callable]: + columns = model.attrs["columns"] + features: List[Ints2d] = [] + for doc in docs: + if hasattr(doc, "to_array"): + attrs = doc.to_array(columns) + else: + attrs = doc.doc.to_array(columns)[doc.start : doc.end] + if attrs.ndim == 1: + attrs = attrs.reshape((attrs.shape[0], 1)) + features.append(model.ops.asarray2i(attrs, dtype="uint64")) + + backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] + return features, backprop diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 16293cda4..1117b4fde 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import HashEmbed, with_array, with_cpu, uniqued -from thinc.api import Relu, residual, expand_window, FeatureExtractor +from thinc.api import Relu, residual, expand_window from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors +from ..featureextractor import FeatureExtractor @registry.architectures.register("spacy.TextCatCNN.v1") diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 63e79bf95..2870de1b9 100644 --- a/spacy/ml/models/tok2vec.py +++ 
b/spacy/ml/models/tok2vec.py @@ -1,16 +1,16 @@ -from typing import Optional, List -from thinc.api import chain, clone, concatenate, with_array, with_padded -from thinc.api import Model, noop, list2ragged, ragged2list -from thinc.api import FeatureExtractor, HashEmbed -from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM +from typing import Optional, List, Union from thinc.types import Floats2d +from thinc.api import chain, clone, concatenate, with_array, with_padded +from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed +from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from ...tokens import Doc from ...util import registry from ...ml import _character_embed from ..staticvectors import StaticVectors +from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE +from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr @registry.architectures.register("spacy.Tok2VecListener.v1") @@ -98,7 +98,7 @@ def MultiHashEmbed( attributes using hash embedding, concatenates the results, and passes it through a feed-forward subnetwork to build a mixed representations. - The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have + The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have varying definitions depending on the Vocab of the Doc object passed in. Vectors from pretrained static vectors can also be incorporated into the concatenated representation. @@ -115,7 +115,7 @@ def MultiHashEmbed( also_use_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ - cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] + cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH] seed = 7 def make_hash_embed(feature): @@ -123,7 +123,7 @@ def MultiHashEmbed( seed += 1 return HashEmbed( width, - rows if feature == NORM else rows // 2, + rows if feature == LOWER else rows // 2, column=cols.index(feature), seed=seed, dropout=0.0, @@ -131,13 +131,13 @@ def MultiHashEmbed( if also_embed_subwords: embeddings = [ - make_hash_embed(NORM), + make_hash_embed(LOWER), make_hash_embed(PREFIX), make_hash_embed(SUFFIX), make_hash_embed(SHAPE), ] else: - embeddings = [make_hash_embed(NORM)] + embeddings = [make_hash_embed(LOWER)] concat_size = width * (len(embeddings) + also_use_static_vectors) if also_use_static_vectors: model = chain( @@ -165,7 +165,8 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed( - width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool + width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool, + feature: Union[int, str]="LOWER" ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for @@ -179,12 +180,13 @@ def CharacterEmbed( of being in an arbitrary position depending on the word length. The characters are embedded in a embedding table with a given number of rows, - and the vectors concatenated. A hash-embedded vector of the NORM of the word is + and the vectors concatenated. A hash-embedded vector of the LOWER of the word is also concatenated on, and the result is then passed through a feed-forward network to construct a single vector to represent the information. - width (int): The width of the output vector and the NORM hash embedding. 
- rows (int): The number of rows in the NORM hash embedding table. + feature (int or str): An attribute to embed, to concatenate with the characters. + width (int): The width of the output vector and the feature embedding. + rows (int): The number of rows in the LOWER hash embedding table. nM (int): The dimensionality of the character embeddings. Recommended values are between 16 and 64. nC (int): The number of UTF-8 bytes to embed per word. Recommended values @@ -193,12 +195,15 @@ def CharacterEmbed( also_use_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ + feature = intify_attr(feature) + if feature is None: + raise ValueError("Invalid feature: Must be a token attribute.") if also_use_static_vectors: model = chain( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), chain( - FeatureExtractor([NORM]), + FeatureExtractor([feature]), list2ragged(), with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), ), @@ -214,7 +219,7 @@ def CharacterEmbed( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), chain( - FeatureExtractor([NORM]), + FeatureExtractor([feature]), list2ragged(), with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), ), diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index 008ac3384..d0362e7e1 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]: def analyze_pipes( - nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, + nlp: "Language", *, keys: List[str] = DEFAULT_KEYS ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Print a formatted summary for the current nlp object's pipeline. Shows a table with the pipeline components and why they assign and require, as diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 4243ebcfb..f314953e9 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -82,8 +82,7 @@ class AttributeRuler(Pipe): matches = self.matcher(doc, allow_missing=True) # Sort by the attribute ID, so that later rules have precendence matches = [ - (int(self.vocab.strings[m_id]), m_id, s, e) - for m_id, s, e in matches + (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches ] matches.sort() for attr_id, match_id, start, end in matches: @@ -93,7 +92,7 @@ class AttributeRuler(Pipe): try: # The index can be negative, which makes it annoying to do # the boundscheck. Let Span do it instead. - token = span[index] + token = span[index] # noqa: F841 except IndexError: # The original exception is just our conditional logic, so we # raise from. @@ -103,7 +102,7 @@ class AttributeRuler(Pipe): span=[t.text for t in span], index=index, ) - ) from None + ) from None set_token_attrs(span[index], attrs) return doc diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a447434d2..bdef332cc 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -126,13 +126,13 @@ cdef class DependencyParser(Parser): def add_multitask_objective(self, mt_component): self._multitasks.append(mt_component) - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, nlp=None, **cfg): # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) + labeller.initialize(get_examples, nlp=nlp) @property def labels(self): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 039e2a891..b67a15d32 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ from itertools import islice -from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple +from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List from pathlib import Path import srsly import random @@ -140,26 +140,20 @@ class EntityLinker(Pipe): if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, - ) -> Optimizer: + nlp: Optional[Language] = None, + ): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/entitylinker#begin_training + DOCS: https://nightly.spacy.io/api/entitylinker#initialize """ self._ensure_examples(get_examples) self._require_kb() @@ -174,9 +168,6 @@ class EntityLinker(Pipe): self.model.initialize( X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") ) - if sgd is None: - sgd = self.create_optimizer() - return sgd def update( self, diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index c30d09f62..9be596868 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -1,26 +1,25 @@ -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union +from typing import Tuple from thinc.api import Model +from pathlib import Path from .pipe import Pipe from ..errors import Errors from ..language import Language +from ..training import Example from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab from ..training import validate_examples +from ..util import logger, SimpleFrozenList from .. 
import util @Language.factory( "lemmatizer", assigns=["token.lemma"], - default_config={ - "model": None, - "mode": "lookup", - "lookups": None, - "overwrite": False, - }, + default_config={"model": None, "mode": "lookup", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( @@ -28,13 +27,9 @@ def make_lemmatizer( model: Optional[Model], name: str, mode: str, - lookups: Optional[Lookups], overwrite: bool = False, ): - lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) - return Lemmatizer( - nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite - ) + return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) class Lemmatizer(Pipe): @@ -46,59 +41,19 @@ class Lemmatizer(Pipe): """ @classmethod - def get_lookups_config(cls, mode: str) -> Dict: + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: """Returns the lookups configuration settings for a given mode for use in Lemmatizer.load_lookups. mode (str): The lemmatizer mode. - RETURNS (dict): The lookups configuration settings for this mode. - - DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config + RETURNS (Tuple[List[str], List[str]]): The required and optional + lookup tables for this mode. """ if mode == "lookup": - return { - "required_tables": ["lemma_lookup"], - } + return (["lemma_lookup"], []) elif mode == "rule": - return { - "required_tables": ["lemma_rules"], - "optional_tables": ["lemma_exc", "lemma_index"], - } - return {} - - @classmethod - def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups: - """Load and validate lookups tables. If the provided lookups is None, - load the default lookups tables according to the language and mode - settings. Confirm that all required tables for the language and mode - are present. - - lang (str): The language code. - mode (str): The lemmatizer mode. - lookups (Lookups): The provided lookups, may be None if the default - lookups should be loaded. - RETURNS (Lookups): The Lookups object. - - DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config - """ - config = cls.get_lookups_config(mode) - required_tables = config.get("required_tables", []) - optional_tables = config.get("optional_tables", []) - if lookups is None: - lookups = load_lookups(lang=lang, tables=required_tables) - optional_lookups = load_lookups( - lang=lang, tables=optional_tables, strict=False - ) - for table in optional_lookups.tables: - lookups.set_table(table, optional_lookups.get_table(table)) - for table in required_tables: - if table not in lookups: - raise ValueError( - Errors.E1004.format( - mode=mode, tables=required_tables, found=lookups.tables - ) - ) - return lookups + return (["lemma_rules"], ["lemma_exc", "lemma_index"]) + return ([], []) def __init__( self, @@ -107,7 +62,6 @@ class Lemmatizer(Pipe): name: str = "lemmatizer", *, mode: str = "lookup", - lookups: Optional[Lookups] = None, overwrite: bool = False, ) -> None: """Initialize a Lemmatizer. @@ -116,9 +70,6 @@ class Lemmatizer(Pipe): model (Model): A model (not yet implemented). name (str): The component name. Defaults to "lemmatizer". mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". - lookups (Lookups): The lookups object containing the (optional) tables - such as "lemma_rules", "lemma_index", "lemma_exc" and - "lemma_lookup". Defaults to None overwrite (bool): Whether to overwrite existing lemmas. Defaults to `False`. 
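Reviewer note on the lemmatizer changes above: the lookup tables are no longer resolved in the factory via a `lookups` argument; they are loaded (or passed in) when the component is initialized, and the required tables for the mode are validated at that point. A minimal sketch of how this could be used, assuming this branch of spacy-nightly; the table contents below are invented for illustration, and `spacy-lookups-data` is not needed because the tables are supplied directly:

import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

# Build a custom lookups object instead of relying on spacy-lookups-data.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cats": "cat", "going": "go"})

# Tables are now provided at initialization time; missing required tables
# for the mode raise a validation error (E1004).
lemmatizer.initialize(lookups=lookups)

doc = nlp("cats going")
print([t.lemma_ for t in doc])  # ['cat', 'go']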
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe): self.model = model self.name = name self._mode = mode - self.lookups = lookups if lookups is not None else Lookups() + self.lookups = Lookups() self.overwrite = overwrite + self._validated = False if self.mode == "lookup": self.lemmatize = self.lookup_lemmatize elif self.mode == "rule": @@ -153,12 +105,56 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#call """ + if not self._validated: + self._validate_tables(Errors.E1004) for token in doc: if self.overwrite or token.lemma == 0: token.lemma_ = self.lemmatize(token)[0] return doc - def pipe(self, stream, *, batch_size=128): + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + nlp: Optional[Language] = None, + lookups: Optional[Lookups] = None, + ): + """Initialize the lemmatizer and load in data. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + lookups (Lookups): The lookups object containing the (optional) tables + such as "lemma_rules", "lemma_index", "lemma_exc" and + "lemma_lookup". Defaults to None. + """ + required_tables, optional_tables = self.get_lookups_config(self.mode) + if lookups is None: + logger.debug("Lemmatizer: loading tables from spacy-lookups-data") + lookups = load_lookups(lang=self.vocab.lang, tables=required_tables) + optional_lookups = load_lookups( + lang=self.vocab.lang, tables=optional_tables, strict=False + ) + for table in optional_lookups.tables: + lookups.set_table(table, optional_lookups.get_table(table)) + self.lookups = lookups + self._validate_tables(Errors.E1004) + + def _validate_tables(self, error_message: str = Errors.E912) -> None: + """Check that the lookups are correct for the current mode.""" + required_tables, optional_tables = self.get_lookups_config(self.mode) + for table in required_tables: + if table not in self.lookups: + raise ValueError( + error_message.format( + mode=self.mode, + tables=required_tables, + found=self.lookups.tables, + ) + ) + self._validated = True + + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -263,7 +259,7 @@ class Lemmatizer(Pipe): """ return False - def score(self, examples, **kwargs) -> Dict[str, Any]: + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. examples (Iterable[Example]): The examples to score. @@ -274,58 +270,66 @@ class Lemmatizer(Pipe): validate_examples(examples, "Lemmatizer.score") return Scorer.score_token_attr(examples, "lemma", **kwargs) - def to_disk(self, path, *, exclude=tuple()): - """Save the current state to a directory. + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ): + """Serialize the pipe to disk. - path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. - exclude (list): String names of serialization fields to exclude. + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. 
- DOCS: https://nightly.spacy.io/api/vocab#to_disk + DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk """ serialize = {} serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["lookups"] = lambda p: self.lookups.to_disk(p) util.to_disk(path, serialize, exclude) - def from_disk(self, path, *, exclude=tuple()): - """Loads state from a directory. Modifies the object in place and - returns it. + def from_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() + ) -> "Lemmatizer": + """Load the pipe from disk. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. - exclude (list): String names of serialization fields to exclude. - RETURNS (Vocab): The modified `Vocab` object. + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (Lemmatizer): The modified Lemmatizer object. - DOCS: https://nightly.spacy.io/api/vocab#to_disk + DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk """ deserialize = {} deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["lookups"] = lambda p: self.lookups.from_disk(p) util.from_disk(path, deserialize, exclude) + self._validate_tables() + return self - def to_bytes(self, *, exclude=tuple()) -> bytes: - """Serialize the current state to a binary string. + def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: + """Serialize the pipe to a bytestring. - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The serialized form of the `Vocab` object. + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. - DOCS: https://nightly.spacy.io/api/vocab#to_bytes + DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes """ serialize = {} serialize["vocab"] = self.vocab.to_bytes serialize["lookups"] = self.lookups.to_bytes return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data: bytes, *, exclude=tuple()): - """Load state from a binary string. + def from_bytes( + self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList() + ) -> "Lemmatizer": + """Load the pipe from a bytestring. - bytes_data (bytes): The data to load from. - exclude (list): String names of serialization fields to exclude. - RETURNS (Vocab): The `Vocab` object. + bytes_data (bytes): The serialized pipe. + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (Lemmatizer): The loaded Lemmatizer. 
- DOCS: https://nightly.spacy.io/api/vocab#from_bytes + DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes """ deserialize = {} deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) util.from_bytes(bytes_data, deserialize, exclude) + self._validate_tables() + return self diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 5fee9a900..ab0554692 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional +from typing import Optional, Union, Dict import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -101,6 +101,11 @@ class Morphologizer(Tagger): """RETURNS (Tuple[str]): The labels currently added to the component.""" return tuple(self.cfg["labels_morph"].keys()) + @property + def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]: + """A dictionary with all labels data.""" + return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]} + def add_label(self, label): """Add a new label to the pipe. @@ -129,27 +134,22 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/morphologizer#begin_training + DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ self._ensure_examples(get_examples) # First, fetch all labels from the data for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ - morph = token.morph_ + morph = str(token.morph) # create and add the combined morph+POS label morph_dict = Morphology.feats_to_dict(morph) if pos: @@ -167,7 +167,7 @@ class Morphologizer(Tagger): gold_array = [] for i, token in enumerate(example.reference): pos = token.pos_ - morph = token.morph_ + morph = str(token.morph) morph_dict = Morphology.feats_to_dict(morph) if pos: morph_dict[self.POS_FEAT] = pos @@ -178,9 +178,6 @@ class Morphologizer(Tagger): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. 
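Reviewer note on the pattern recurring throughout this diff: `begin_training(get_examples, pipeline=..., sgd=...)` becomes `initialize(get_examples, nlp=...)`, which no longer creates or returns an optimizer; optimizer creation now lives on the Language object. A minimal usage sketch under that assumption (the training example below is made up, and `nlp.create_optimizer()` relies on the default [training.optimizer] block being present in the pipeline config):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
morphologizer = nlp.add_pipe("morphologizer")

doc = nlp.make_doc("I like cats")
annots = {"pos": ["PRON", "VERB", "NOUN"], "morphs": ["Case=Nom", "", "Number=Plur"]}
example = Example.from_dict(doc, annots)

# Old: sgd = morphologizer.begin_training(lambda: [example], pipeline=nlp.pipeline)
# New: initialize only sets up labels and the model; no optimizer is returned.
morphologizer.initialize(lambda: [example], nlp=nlp)
optimizer = nlp.create_optimizer()  # resolved from [training.optimizer] via the registry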
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 2f8940124..ba351f16e 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -81,7 +81,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, nlp=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) @@ -91,9 +91,6 @@ class MultitaskObjective(Tagger): if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() # TODO: fix initialization by defining X and Y - if sgd is None: - sgd = self.create_optimizer() - return sgd def predict(self, docs): tokvecs = self.model.get_ref("tok2vec")(docs) @@ -177,13 +174,10 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, nlp=None): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) - self.model.output_layer.begin_training(X) - if sgd is None: - sgd = self.create_optimizer() - return sgd + self.model.output_layer.initialize(X) def predict(self, docs): tokvecs = self.model.get_ref("tok2vec")(docs) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index fc0dda40d..6482d6125 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser): """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_examples, nlp=None, **cfg): """Setup multi-task objective components. Experimental and internal.""" # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline) + labeller.initialize(get_examples, nlp=nlp) @property def labels(self): diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 324c8e19c..5316620e9 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True +from typing import Optional, Tuple import srsly from thinc.api import set_dropout_rate, Model @@ -32,6 +33,17 @@ cdef class Pipe: self.name = name self.cfg = dict(cfg) + @property + def labels(self) -> Optional[Tuple[str]]: + return [] + + @property + def label_data(self): + """Optional JSON-serializable data that would be sufficient to recreate + the label set if provided to the `pipe.initialize()` method. + """ + return None + def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the nlp object @@ -183,7 +195,7 @@ cdef class Pipe: """ return util.create_default_optimizer() - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using data examples if available. 
This method needs to be implemented by each Pipe component, ensuring the internal model (if available) is initialized properly @@ -191,16 +203,11 @@ cdef class Pipe: get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/pipe#begin_training + DOCS: https://nightly.spacy.io/api/pipe#initialize """ - raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) + pass def _ensure_examples(self, get_examples): if get_examples is None or not hasattr(get_examples, "__call__"): diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 2882f6f8b..13fcd15e2 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,7 +58,7 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, nlp=None): pass def __call__(self, doc): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index da85a9cf2..65c17c771 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def label_data(self): + return self.labels + def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. @@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, nlp=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. 
- DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training + DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ self._ensure_examples(get_examples) doc_sample = [] @@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def add_label(self, label, values=None): raise NotImplementedError diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 3efe29916..37ad42b88 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -90,6 +90,11 @@ class Tagger(Pipe): """ return tuple(self.cfg["labels"]) + @property + def label_data(self): + """Data about the labels currently added to the component.""" + return tuple(self.cfg["labels"]) + def __call__(self, doc): """Apply the pipe to a Doc. @@ -256,31 +261,33 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, nlp=None, labels=None): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects.. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. + labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. - DOCS: https://nightly.spacy.io/api/tagger#begin_training + DOCS: https://nightly.spacy.io/api/tagger#initialize """ self._ensure_examples(get_examples) + if labels is not None: + for tag in labels: + self.add_label(tag) + else: + tags = set() + for example in get_examples(): + for token in example.y: + if token.tag_: + tags.add(token.tag_) + for tag in sorted(tags): + self.add_label(tag) doc_sample = [] label_sample = [] - tags = set() - for example in get_examples(): - for token in example.y: - if token.tag_: - tags.add(token.tag_) - for tag in sorted(tags): - self.add_label(tag) for example in islice(get_examples(), 10): doc_sample.append(example.x) gold_tags = example.get_aligned("TAG", as_string=True) @@ -289,9 +296,6 @@ class Tagger(Pipe): assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def add_label(self, label): """Add a new label to the pipe. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 6b8c0ca65..a092d960f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -154,8 +154,16 @@ class TextCategorizer(Pipe): @labels.setter def labels(self, value: List[str]) -> None: + # TODO: This really shouldn't be here. I had a look and I added it when + # I added the labels property, but it's pretty nasty to have this, and + # will lead to problems. 
self.cfg["labels"] = tuple(value) + @property + def label_data(self) -> List[str]: + """RETURNS (List[str]): Information about the component's labels.""" + return self.labels + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are @@ -334,43 +342,40 @@ class TextCategorizer(Pipe): self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, - ) -> Optimizer: + nlp: Optional[Language] = None, + labels: Optional[Dict] = None, + ): """Initialize the pipe for training, using a representative set of data examples. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. + labels: The labels to add to the component, typically generated by the + `init labels` command. If no labels are provided, the get_examples + callback is used to extract the labels from the data. - DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training + DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ self._ensure_examples(get_examples) - subbatch = [] # Select a subbatch of examples to initialize the model - for example in islice(get_examples(), 10): - if len(subbatch) < 2: - subbatch.append(example) - for cat in example.y.cats: - self.add_label(cat) + if labels is None: + for example in get_examples(): + for cat in example.y.cats: + self.add_label(cat) + else: + for label in labels: + self.add_label(label) + subbatch = list(islice(get_examples(), 10)) doc_sample = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch) self._require_labels() assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - if sgd is None: - sgd = self.create_optimizer() - return sgd def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 9ab4e42b7..89f9df757 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple +from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List from thinc.api import Model, set_dropout_rate, Optimizer, Config from itertools import islice @@ -203,26 +203,20 @@ class Tok2Vec(Pipe): def get_loss(self, examples, scores) -> None: pass - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, + nlp: Optional[Language] = None, ): """Initialize the pipe for training, using a representative set of data examples. 
get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. - pipeline (List[Tuple[str, Callable]]): Optional list of pipeline - components that this component is part of. Corresponds to - nlp.pipeline. - sgd (thinc.api.Optimizer): Optional optimizer. Will be created with - create_optimizer if it doesn't exist. - RETURNS (thinc.api.Optimizer): The optimizer. + nlp (Language): The current nlp object the component is part of. - DOCS: https://nightly.spacy.io/api/tok2vec#begin_training + DOCS: https://nightly.spacy.io/api/tok2vec#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 1350e1f12..bcaa8e8d4 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, cdivision=True, boundscheck=False +# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True from __future__ import print_function from cymem.cymem cimport Pool cimport numpy as np @@ -7,6 +7,7 @@ from libcpp.vector cimport vector from libc.string cimport memset from libc.stdlib cimport calloc, free import random +from typing import Optional import srsly from thinc.api import set_dropout_rate @@ -95,6 +96,10 @@ cdef class Parser(Pipe): class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)] return class_names + @property + def label_data(self): + return self.moves.labels + @property def tok2vec(self): """Return the embedding and convolutional layer of the model.""" @@ -354,7 +359,7 @@ cdef class Parser(Pipe): # If all weights for an output are 0 in the original model, don't # supervise that output. This allows us to add classes. 
loss += (d_scores**2).sum() - backprop(d_scores, sgd=sgd) + backprop(d_scores) # Follow the predicted action self.transition_states(states, guesses) states = [state for state in states if not state.is_final()] @@ -405,18 +410,20 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + def initialize(self, get_examples, nlp=None, labels=None): self._ensure_examples(get_examples) - self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: langs = ", ".join(util.LEXEME_NORM_LANGS) util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs)) - actions = self.moves.get_actions( - examples=get_examples(), - min_freq=self.cfg['min_action_freq'], - learn_tokens=self.cfg["learn_tokens"] - ) + if labels is not None: + actions = dict(labels) + else: + actions = self.moves.get_actions( + examples=get_examples(), + min_freq=self.cfg['min_action_freq'], + learn_tokens=self.cfg["learn_tokens"] + ) for action, labels in self.moves.labels.items(): actions.setdefault(action, {}) for label, freq in labels.items(): @@ -425,11 +432,9 @@ cdef class Parser(Pipe): self.moves.initialize_actions(actions) # make sure we resize so we have an appropriate upper layer self._resize() - if sgd is None: - sgd = self.create_optimizer() doc_sample = [] - if pipeline is not None: - for name, component in pipeline: + if nlp is not None: + for name, component in nlp.pipeline: if component is self: break if hasattr(component, "pipe"): @@ -441,9 +446,8 @@ cdef class Parser(Pipe): doc_sample.append(example.predicted) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(doc_sample) - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - return sgd + if nlp is not None: + self.init_multitask_objectives(get_examples, nlp.pipeline) def to_disk(self, path, exclude=tuple()): serializers = { diff --git a/spacy/schemas.py b/spacy/schemas.py index 7951b851b..591b7e134 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,15 +1,17 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum -from pydantic import BaseModel, Field, ValidationError, validator +from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool -from pydantic import root_validator +from pydantic.main import ModelMetaclass +from thinc.api import Optimizer, ConfigValidationError from thinc.config import Promise from collections import defaultdict -from thinc.api import Optimizer +import inspect from .attrs import NAMES from .lookups import Lookups +from .util import is_cython_func if TYPE_CHECKING: # This lets us add type hints for mypy etc. 
without causing circular imports @@ -44,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] + +# Initialization + + +class ArgSchemaConfig: + extra = "forbid" + arbitrary_types_allowed = True + + +class ArgSchemaConfigExtra: + extra = "allow" + arbitrary_types_allowed = True + + +def get_arg_model( + func: Callable, + *, + exclude: Iterable[str] = tuple(), + name: str = "ArgModel", + strict: bool = True, +) -> ModelMetaclass: + """Generate a pydantic model for function arguments. + + func (Callable): The function to generate the schema for. + exclude (Iterable[str]): Parameter names to ignore. + name (str): Name of created model class. + strict (bool): Don't allow extra arguments if no variable keyword arguments + are allowed on the function. + RETURNS (ModelMetaclass): A pydantic model. + """ + sig_args = {} + try: + sig = inspect.signature(func) + except ValueError: + # Typically happens if the method is part of a Cython module without + # binding=True. Here we just use an empty model that allows everything. + return create_model(name, __config__=ArgSchemaConfigExtra) + has_variable = False + for param in sig.parameters.values(): + if param.name in exclude: + continue + if param.kind == param.VAR_KEYWORD: + # The function allows variable keyword arguments so we shouldn't + # include **kwargs etc. in the schema and switch to non-strict + # mode and pass through all other values + has_variable = True + continue + # If no annotation is specified assume it's anything + annotation = param.annotation if param.annotation != param.empty else Any + # If no default value is specified assume that it's required. Cython + # functions/methods will have param.empty for default value None so we + # need to treat them differently + default_empty = None if is_cython_func(func) else ... + default = param.default if param.default != param.empty else default_empty + sig_args[param.name] = (annotation, default) + is_strict = strict and not has_variable + sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra + return create_model(name, **sig_args) + + +def validate_init_settings( + func: Callable, + settings: Dict[str, Any], + *, + section: Optional[str] = None, + name: str = "", + exclude: Iterable[str] = ("get_examples", "nlp"), +) -> Dict[str, Any]: + """Validate initialization settings against the expected arguments in + the method signature. Will parse values if possible (e.g. int to string) + and return the updated settings dict. Will raise a ConfigValidationError + if types don't match or required values are missing. + + func (Callable): The initialize method of a given component etc. + settings (Dict[str, Any]): The settings from the respective [initialize] block. + section (str): Initialize section, for error message. + name (str): Name of the block in the section. + exclude (Iterable[str]): Parameter names to exclude from schema. + RETURNS (Dict[str, Any]): The validated settings.
+ """ + schema = get_arg_model(func, exclude=exclude, name="InitArgModel") + try: + return schema(**settings).dict() + except ValidationError as e: + block = "initialize" if not section else f"initialize.{section}" + title = f"Error validating initialization settings in [{block}]" + raise ConfigValidationError( + title=title, errors=e.errors(), config=settings, parent=name + ) from None + + # Matcher token patterns @@ -190,7 +282,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers") + performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on @@ -205,8 +297,6 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off - vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - lookups: Optional[Lookups] = Field(..., title="Vocab lookups") dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") @@ -219,8 +309,6 @@ class ConfigSchemaTraining(BaseModel): gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") - init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") @@ -273,36 +361,40 @@ class ConfigSchemaPretrain(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaInit(BaseModel): + # fmt: off + vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") + lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. 
lexeme normalization") + vectors: Optional[StrictStr] = Field(..., title="Path to vectors") + init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") + tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component") + # fmt: on + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] corpora: Dict[str, Reader] - - @root_validator(allow_reuse=True) - def validate_config(cls, values): - """Perform additional validation for settings with dependencies.""" - pt = values.get("pretraining") - if pt and not isinstance(pt, ConfigSchemaPretrainEmpty): - if pt.objective.get("type") == "vectors" and not values["nlp"].vectors: - err = "Need nlp.vectors if pretraining.objective.type is vectors" - raise ValueError(err) - return values + initialize: ConfigSchemaInit class Config: extra = "allow" arbitrary_types_allowed = True -class TrainingSchema(BaseModel): - training: ConfigSchemaTraining - pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} - corpora: Dict[str, Reader] - - class Config: - extra = "allow" - arbitrary_types_allowed = True +CONFIG_SCHEMAS = { + "nlp": ConfigSchemaNlp, + "training": ConfigSchemaTraining, + "pretraining": ConfigSchemaPretrain, + "initialize": ConfigSchemaInit, +} # Project config Schema diff --git a/spacy/scorer.py b/spacy/scorer.py index b2f97e163..db32dabae 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -32,9 +32,7 @@ class PRFScore: def __add__(self, other): return PRFScore( - tp=self.tp+other.tp, - fp=self.fp+other.fp, - fn=self.fn+other.fn + tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn ) def score_set(self, cand: set, gold: set) -> None: @@ -485,7 +483,7 @@ class Scorer: (pred_ent.start_char, pred_ent.end_char), None ) label = gold_span.label_ - if not label in f_per_type: + if label not in f_per_type: f_per_type[label] = PRFScore() gold = gold_span.kb_id_ # only evaluating entities that overlap between gold and pred, @@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: continue golds = {(e.label_, e.start, e.end) for e in eg.y.ents} align_x2y = eg.alignment.x2y - preds = set() for pred_ent in eg.x.ents: if pred_ent.label_ not in scores: scores[pred_ent.label_] = PRFScore() diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 3a9a1f26b..bcf582388 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -272,22 +272,35 @@ def zh_tokenizer_char(): def zh_tokenizer_jieba(): pytest.importorskip("jieba") config = { - "@tokenizers": "spacy.zh.ChineseTokenizer", - "segmenter": "jieba", + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "jieba", + } + } } - nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) + nlp = get_lang_class("zh").from_config(config) return nlp.tokenizer @pytest.fixture(scope="session") def zh_tokenizer_pkuseg(): pytest.importorskip("pkuseg") + pytest.importorskip("pickle5") config = { - "@tokenizers": "spacy.zh.ChineseTokenizer", - "segmenter": "pkuseg", - "pkuseg_model": "default", + "nlp": { + "tokenizer": { + "@tokenizers": 
"spacy.zh.ChineseTokenizer", + "segmenter": "pkuseg", + } + }, + "initialize": {"tokenizer": { + "pkuseg_model": "default", + } + }, } - nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}}) + nlp = get_lang_class("zh").from_config(config) + nlp.initialize() return nlp.tokenizer diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 86aa883bd..fa0206fdd 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) doc.ents = [("ANIMAL", 3, 4)] @@ -48,7 +48,7 @@ def test_ents_reset(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] doc.ents = list(doc.ents) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 9c050f740..ef54c581c 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab): words = ["Eat", "blue", "ham"] morph = ["Feat=V", "Feat=J", "Feat=N"] doc = Doc(en_vocab, words=words, morphs=morph) - assert morph[0] == doc[0].morph_ - assert morph[1] == doc[1].morph_ - assert morph[2] == doc[2].morph_ + assert morph[0] == str(doc[0].morph) + assert morph[1] == str(doc[1].morph) + assert morph[2] == str(doc[2].morph) feats_array = doc.to_array((ORTH, MORPH)) assert feats_array[0][1] == doc[0].morph.key diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index e5e72fe2a..e3e056685 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab): assert [t.is_sent_start for t in doc] == [True, False, True, False] # heads override sent_starts doc = Doc( - en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4, + en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4 ) assert [t.is_sent_start for t in doc] == [True, False, True, False] @@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab): words = ["I", "live", "in", "New", "York", "."] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] # fmt: on - doc = Doc(en_vocab, words=words) - for i, morph in enumerate(morphs): - doc[i].morph_ = morph + doc = Doc(en_vocab, words=words, morphs=morphs) attrs = [MORPH] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - assert [t.morph_ for t in new_doc] == morphs - assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] + assert [str(t.morph) for t in new_doc] == morphs + assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc] def test_doc_api_from_docs(en_tokenizer, de_tokenizer): @@ -423,7 +421,7 @@ def test_has_annotation(en_vocab): doc[0].tag_ = "A" doc[0].pos_ = "X" - doc[0].morph_ = "Feat=Val" + doc[0].set_morph("Feat=Val") doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] @@ -435,7 +433,7 @@ def test_has_annotation(en_vocab): doc[1].tag_ = "A" doc[1].pos_ = "X" - doc[1].morph_ = "" + doc[1].set_morph("") doc[1].lemma_ 
= "a" doc[1].dep_ = "dep" doc.ents = [Span(doc, 0, 2, label="HELLO")] @@ -533,5 +531,78 @@ def test_doc_ents_setter(): assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] vocab = Vocab() ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] + ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"] doc = Doc(vocab, words=words, ents=ents) assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] + + +def test_doc_morph_setter(en_tokenizer, de_tokenizer): + doc1 = en_tokenizer("a b") + doc1b = en_tokenizer("c d") + doc2 = de_tokenizer("a b") + + # unset values can be copied + doc1[0].morph = doc1[1].morph + assert doc1[0].morph.key == 0 + assert doc1[1].morph.key == 0 + + # morph values from the same vocab can be copied + doc1[0].set_morph("Feat=Val") + doc1[1].morph = doc1[0].morph + assert doc1[0].morph == doc1[1].morph + + # ... also across docs + doc1b[0].morph = doc1[0].morph + assert doc1[0].morph == doc1b[0].morph + + doc2[0].set_morph("Feat2=Val2") + + # the morph value must come from the same vocab + with pytest.raises(ValueError): + doc1[0].morph = doc2[0].morph + + +def test_doc_init_iob(): + """Test ents validation/normalization in Doc.__init__""" + words = ["a", "b", "c", "d", "e"] + ents = ["O"] * len(words) + doc = Doc(Vocab(), words=words, ents=ents) + assert doc.ents == () + + ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"] + doc = Doc(Vocab(), words=words, ents=ents) + assert len(doc.ents) == 2 + + ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"] + doc = Doc(Vocab(), words=words, ents=ents) + assert len(doc.ents) == 3 + + # None is missing + ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"] + doc = Doc(Vocab(), words=words, ents=ents) + assert len(doc.ents) == 2 + + # empty tag is missing + ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"] + doc = Doc(Vocab(), words=words, ents=ents) + assert len(doc.ents) == 2 + + # invalid IOB + ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"] + with pytest.raises(ValueError): + doc = Doc(Vocab(), words=words, ents=ents) + + # no dash + ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"] + with pytest.raises(ValueError): + doc = Doc(Vocab(), words=words, ents=ents) + + # no ent type + ents = ["O", "B-", "O", "I-PERSON", "I-GPE"] + with pytest.raises(ValueError): + doc = Doc(Vocab(), words=words, ents=ents) + + # not strings or None + ents = [0, "B-", "O", "I-PERSON", "I-GPE"] + with pytest.raises(ValueError): + doc = Doc(Vocab(), words=words, ents=ents) diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index f378ce042..b44b13d4c 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -4,13 +4,13 @@ import pytest @pytest.fixture def i_has(en_tokenizer): doc = en_tokenizer("I has") - doc[0].morph_ = {"PronType": "prs"} - doc[1].morph_ = { + doc[0].set_morph({"PronType": "prs"}) + doc[1].set_morph({ "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": "three", - } + }) return doc @@ -47,20 +47,20 @@ def test_morph_get(i_has): def test_morph_set(i_has): assert i_has[0].morph.get("PronType") == ["prs"] # set by string - i_has[0].morph_ = "PronType=unk" + i_has[0].set_morph("PronType=unk") assert i_has[0].morph.get("PronType") == ["unk"] # set by string, fields are alphabetized - i_has[0].morph_ = "PronType=123|NounType=unk" - assert i_has[0].morph_ == "NounType=unk|PronType=123" + i_has[0].set_morph("PronType=123|NounType=unk") + assert str(i_has[0].morph) == "NounType=unk|PronType=123" # set by 
dict - i_has[0].morph_ = {"AType": "123", "BType": "unk"} - assert i_has[0].morph_ == "AType=123|BType=unk" + i_has[0].set_morph({"AType": "123", "BType": "unk"}) + assert str(i_has[0].morph) == "AType=123|BType=unk" # set by string with multiple values, fields and values are alphabetized - i_has[0].morph_ = "BType=c|AType=b,a" - assert i_has[0].morph_ == "AType=a,b|BType=c" + i_has[0].set_morph("BType=c|AType=b,a") + assert str(i_has[0].morph) == "AType=a,b|BType=c" # set by dict with multiple values, fields and values are alphabetized - i_has[0].morph_ = {"AType": "b,a", "BType": "c"} - assert i_has[0].morph_ == "AType=a,b|BType=c" + i_has[0].set_morph({"AType": "b,a", "BType": "c"}) + assert str(i_has[0].morph) == "AType=a,b|BType=c" def test_morph_str(i_has): @@ -72,25 +72,25 @@ def test_morph_property(tokenizer): doc = tokenizer("a dog") # set through token.morph_ - doc[0].morph_ = "PronType=prs" - assert doc[0].morph_ == "PronType=prs" + doc[0].set_morph("PronType=prs") + assert str(doc[0].morph) == "PronType=prs" assert doc.to_array(["MORPH"])[0] != 0 # unset with token.morph - doc[0].morph = 0 + doc[0].set_morph(None) assert doc.to_array(["MORPH"])[0] == 0 # empty morph is equivalent to "_" - doc[0].morph_ = "" - assert doc[0].morph_ == "" + doc[0].set_morph("") + assert str(doc[0].morph) == "" assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] # "_" morph is also equivalent to empty morph - doc[0].morph_ = "_" - assert doc[0].morph_ == "" + doc[0].set_morph("_") + assert str(doc[0].morph) == "" assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] # set through existing hash with token.morph tokenizer.vocab.strings.add("Feat=Val") - doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") - assert doc[0].morph_ == "Feat=Val" + doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val")) + assert str(doc[0].morph) == "Feat=Val" diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 806c4b46f..cb886545a 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer): assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " assert doc[4].tag_ == "NAMED" - assert doc[4].morph_ == "Number=Plur" + assert str(doc[4].morph) == "Number=Plur" assert doc[5].text == "all night" assert doc[5].text_with_ws == "all night" assert doc[5].tag_ == "NAMED" - assert doc[5].morph_ == "Number=Plur" + assert str(doc[5].morph) == "Number=Plur" def test_doc_retokenize_merge_children(en_tokenizer): @@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer): heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] + ents = ["O"] * len(heads) + ents[0] = "B-PERSON" + ents[1] = "I-PERSON" + ents[10] = "B-GPE" + ents[13] = "B-PERSON" + ents[14] = "I-PERSON" # fmt: on tokens = en_tokenizer(text) doc = Doc( @@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # if there is a parse, span.root provides default values words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] - ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] + ents = ["O"] * len(words) + ents[3] = "B-ent-de" + ents[4] = "I-ent-de" + ents[5] = "B-ent-fg" + ents[6] = "I-ent-fg" deps = ["dep"] * 
len(words) en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-fg") @@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # check that B is preserved if span[start] is B words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] - ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] + ents = ["O"] * len(words) + ents[3] = "B-ent-de" + ents[4] = "I-ent-de" + ents[5] = "B-ent-de" + ents[6] = "I-ent-de" deps = ["dep"] * len(words) doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) with doc.retokenize() as retokenizer: diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 4d4b170f9..238e36d59 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab): assert doc[0].text == "Los" assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 - assert doc[0].morph_ == "Number=Sing" + assert str(doc[0].morph) == "Number=Sing" assert doc[1].idx == 3 assert doc[1].text == "Angeles" assert doc[1].head.text == "start" - assert doc[1].morph_ == "Number=Sing" + assert str(doc[1].morph) == "Number=Sing" assert doc[2].text == "start" assert doc[2].head.text == "." assert doc[3].text == "." diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index c9bcafcfa..9abe5779d 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -9,7 +9,7 @@ def doc(en_vocab): tags = ["VBP", "NN", "NN"] heads = [0, 0, 0] deps = ["ROOT", "dobj", "dobj"] - ents = [("ORG", 1, 2)] + ents = ["O", "B-ORG", "O"] return Doc( en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents ) diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index 0ed12d208..7b8b15b1c 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_de(de_tokenizer): - """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.""" doc = de_tokenizer("Er lag auf seinem") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 2d376c612..2684a5cfb 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_el(el_tokenizer): - """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.""" doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 0189a26d4..540f3ed84 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -7,8 +7,7 @@ import pytest def test_noun_chunks_is_parsed(en_tokenizer): - """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. 
- """ + """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.""" doc = en_tokenizer("This is a sentence") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index db89fd903..e5afd81c9 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_es(es_tokenizer): - """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.""" doc = es_tokenizer("en Oxford este verano") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index 53b39d9a1..d2411e6d3 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_fa(fa_tokenizer): - """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.""" doc = fa_tokenizer("این یک جمله نمونه می باشد.") with pytest.raises(ValueError): diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 77e72a76b..d75c653d0 100644 --- a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text): assert len(tokens) == 1 -@pytest.mark.parametrize( - "text", ["janv.", "juill.", "Dr.", "av.", "sept."], -) +@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."]) def test_fr_tokenizer_handles_abbr(fr_tokenizer, text): tokens = fr_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index d81199a3e..48ac88ead 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_fr(fr_tokenizer): - """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.""" doc = fr_tokenizer("trouver des travaux antérieurs") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index fef1524f1..a39456581 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_id(id_tokenizer): - """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. 
- """ + """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.""" doc = id_tokenizer("sebelas") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index e52741b70..c8c85d655 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): @pytest.mark.parametrize( - "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS, + "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS ) def test_ja_tokenizer_sub_tokens( ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index 9965fcd14..dd259f2b7 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -2,8 +2,7 @@ import pytest def test_noun_chunks_is_parsed_nb(nb_tokenizer): - """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. - """ + """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.""" doc = nb_tokenizer("Smørsausen brukes bl.a. til") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py index 7dd971132..e8a6c2e98 100644 --- a/spacy/tests/lang/ne/test_text.py +++ b/spacy/tests/lang/ne/test_text.py @@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer): @pytest.mark.parametrize( - "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], + "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)] ) def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): tokens = ne_tokenizer(text) diff --git a/spacy/tests/lang/sa/test_text.py b/spacy/tests/lang/sa/test_text.py index 41257a4d8..daa8d20c0 100644 --- a/spacy/tests/lang/sa/test_text.py +++ b/spacy/tests/lang/sa/test_text.py @@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer): @pytest.mark.parametrize( "text,length", [ - ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,), + ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9), ("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6), ], ) diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index 3791d8021..d2410156c 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -3,8 +3,7 @@ from spacy.tokens import Doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): - """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. 
- """ + """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.""" doc = sv_tokenizer("Studenten läste den bästa boken") with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 6e7f82341..5f45664eb 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd): @registry.misc("lemmatizer_init_lookups") def lemmatizer_init_lookups(): lookups = Lookups() - lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups - """Test that languages can be initialized.""" + # Test that languages can be initialized nlp = get_lang_class(lang)() - nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}}) + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) + assert not lemmatizer.lookups.tables + nlp.config["initialize"]["components"]["lemmatizer"] = { + "lookups": {"@misc": "lemmatizer_init_lookups"} + } + with pytest.raises(ValueError): + nlp("x") + nlp.initialize() + assert lemmatizer.lookups.tables + doc = nlp("x") # Check for stray print statements (see #3342) - doc = nlp("test") # noqa: F841 captured = capfd.readouterr() assert not captured.out + assert doc[0].lemma_ == "y" + + # Test initialization by calling .initialize() directly + nlp = get_lang_class(lang)() + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) + lemmatizer.initialize(lookups=lemmatizer_init_lookups()) + assert nlp("x")[0].lemma_ == "y" diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 1c6fdf419..58c084ec8 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -27,9 +27,18 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba): @pytest.mark.slow def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): - nlp = Chinese( - meta={ - "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}} - } - ) + config = { + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.zh.ChineseTokenizer", + "segmenter": "pkuseg", + } + }, + "initialize": {"tokenizer": { + "pkuseg_model": "medicine", + } + }, + } + nlp = Chinese.from_config(config) + nlp.initialize() zh_tokenizer_serialize(nlp.tokenizer) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 627110cdd..77b09f376 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab): matcher.add("M", [pattern]) doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val" + doc[0].set_morph("Feat=Val") assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val|Feat2=Val2" + doc[0].set_morph("Feat=Val|Feat2=Val2") assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") assert len(matcher(doc)) == 2 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") assert len(matcher(doc)) == 2 # IS_SUBSET acts like "IN" for 
attrs other than MORPH @@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab): matcher.add("M", [pattern]) doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat=Val|Feat2=Val2" + doc[0].set_morph("Feat=Val|Feat2=Val2") assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") assert len(matcher(doc)) == 1 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") assert len(matcher(doc)) == 1 # IS_SUPERSET with more than one value only matches for MORPH @@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab): doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat2=Val2|Feat1=Val1" + doc[0].set_morph("Feat2=Val2|Feat1=Val1") assert len(matcher(doc)) == 2 - doc[0].morph_ = "Feat1=Val1|Feat2=Val2" + doc[0].set_morph("Feat1=Val1|Feat2=Val2") assert len(matcher(doc)) == 2 # multiple values are split @@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab): doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" + doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1") assert len(matcher(doc)) == 1 - doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" + doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2") assert len(matcher(doc)) == 2 @@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) # DEP requires DEP diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 522356ffc..1b81fd780 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): @@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) # DEP requires DEP diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index cd376e0fc..2f750b60c 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)]) sgd = Adam(0.001) for i in range(5): @@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly(): ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") - ner1.begin_training(lambda: [_ner_example(ner1)]) + ner1.initialize(lambda: [_ner_example(ner1)]) ner2 = EntityRecognizer(Vocab(), model, **config) # the second model needs to be resized before we can call from_bytes diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index cd5581769..b657ae2e8 100644 --- a/spacy/tests/parser/test_ner.py +++ 
b/spacy/tests/parser/test_ner.py @@ -202,7 +202,7 @@ def test_train_empty(): train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) ner = nlp.add_pipe("ner", last=True) ner.add_label("PERSON") - nlp.begin_training() + nlp.initialize() for itn in range(2): losses = {} batches = util.minibatch(train_examples, size=8) @@ -213,7 +213,7 @@ def test_train_empty(): def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() # The untrained NER will predict O for each token doc = nlp("I live in New York") assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] @@ -235,7 +235,7 @@ def test_empty_ner(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("John is watching the news about Croatia's elections") # if this goes wrong, the initialization of the parser's upper layer is probably broken result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"] @@ -254,7 +254,7 @@ def test_ruler_before_ner(): # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -269,7 +269,7 @@ def test_ner_before_ruler(): # 1: untrained NER - should set everything to O untrained_ner = nlp.add_pipe("ner", name="uner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() # 2 : Entity Ruler - should set "this" to B and keep everything else O patterns = [{"label": "THING", "pattern": "This"}] @@ -290,7 +290,7 @@ def test_block_ner(): nlp.add_pipe("blocker", config={"start": 2, "end": 5}) untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti L Korhonen speaking in Finland") expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"] expected_types = ["", "", "", "", "", "", "", ""] @@ -307,7 +307,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for ent in annotations.get("entities"): ner.add_label(ent[2]) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(50): losses = {} @@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog): assert not len(nlp.vocab.lookups) nlp.add_pipe("ner") with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" in caplog.text caplog.clear() nlp.vocab.lookups.add_table("lexeme_norm") nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" not in caplog.text @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") + doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified") return doc diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8648f2018..ffb6f23f1 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -191,7 +191,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(100): losses = {} 
nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index e8dfa68c7..ab58ac17b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -34,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)]) sgd = Adam(0.001) for i in range(10): diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index b9e5894dd..5773127af 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts): a.add(**p) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") @@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") @@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): ) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") @@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts): nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" dev_examples = [ Example.from_dict( @@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map): for i in range(len(doc)): if i == 4: assert doc[i].pos_ == "PUNCT" - assert doc[i].morph_ == "PunctType=peri" + assert str(doc[i].morph) == "PunctType=peri" else: assert doc[i].pos_ == "" - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" def test_attributeruler_morph_rules(nlp, morph_rules): @@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules): for i in range(len(doc)): if i != 2: assert doc[i].pos_ == "" - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" else: assert doc[2].pos_ == "DET" assert doc[2].lemma_ == "a" - assert doc[2].morph_ == "Case=Nom" + assert str(doc[2].morph) == "Case=Nom" def test_attributeruler_indices(nlp): @@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp): for i in range(len(doc)): if i == 1: 
assert doc[i].lemma_ == "was" - assert doc[i].morph_ == "Case=Nom|Number=Sing" + assert str(doc[i].morph) == "Case=Nom|Number=Sing" elif i == 2: assert doc[i].lemma_ == "the" - assert doc[i].morph_ == "Case=Nom|Number=Plur" + assert str(doc[i].morph) == "Case=Nom|Number=Plur" elif i == 3: assert doc[i].lemma_ == "cat" else: - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) with pytest.raises(ValueError): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 878f41a28..66de54c06 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -134,7 +134,7 @@ def test_kb_undefined(nlp): """Test that the EL can't train without defining a KB""" entity_linker = nlp.add_pipe("entity_linker", config={}) with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_empty(nlp): @@ -143,7 +143,7 @@ def test_kb_empty(nlp): entity_linker = nlp.add_pipe("entity_linker", config=config) assert len(entity_linker.kb) == 0 with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_serialize(nlp): @@ -254,14 +254,12 @@ def test_vocab_serialization(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) + mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2]) - q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3]) + mykb.add_entity(entity="Q3", freq=5, entity_vector=[3]) # adding aliases - douglas_hash = mykb.add_alias( - alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1] - ) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) candidates = mykb.get_alias_candidates("adam") @@ -360,7 +358,7 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True) - nlp.begin_training() + nlp.initialize() assert entity_linker.model.get_dim("nO") == vector_length # test whether the entity links are preserved by the `as_doc()` function @@ -463,7 +461,7 @@ def test_overfitting_IO(): ) # train the NEL pipe - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert entity_linker.model.get_dim("nO") == vector_length assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py new file mode 100644 index 000000000..c9b514770 --- /dev/null +++ b/spacy/tests/pipeline/test_initialize.py @@ -0,0 +1,69 @@ +import pytest +from spacy.language import Language +from spacy.lang.en import English +from spacy.training import Example +from thinc.api import ConfigValidationError +from pydantic import StrictBool + + +def test_initialize_arguments(): + name = "test_initialize_arguments" + + class CustomTokenizer: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.from_initialize = None + + def __call__(self, 
text): + return self.tokenizer(text) + + def initialize(self, get_examples, nlp, custom: int): + self.from_initialize = custom + + class Component: + def __init__(self): + self.from_initialize = None + + def initialize( + self, get_examples, nlp, custom1: str, custom2: StrictBool = False + ): + self.from_initialize = (custom1, custom2) + + Language.factory(name, func=lambda nlp, name: Component()) + + nlp = English() + nlp.tokenizer = CustomTokenizer(nlp.tokenizer) + example = Example.from_dict(nlp("x"), {}) + get_examples = lambda: [example] + nlp.add_pipe(name) + # The settings here will typically come from the [initialize] block + init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}} + nlp.config["initialize"].update(init_cfg) + with pytest.raises(ConfigValidationError) as e: + # Empty config for component, no required custom1 argument + nlp.initialize(get_examples) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ("custom1",) + assert errors[0]["type"] == "value_error.missing" + init_cfg = { + "tokenizer": {"custom": 1}, + "components": {name: {"custom1": "x", "custom2": 1}}, + } + nlp.config["initialize"].update(init_cfg) + with pytest.raises(ConfigValidationError) as e: + # Wrong type of custom 2 + nlp.initialize(get_examples) + errors = e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ("custom2",) + assert errors[0]["type"] == "value_error.strictbool" + init_cfg = { + "tokenizer": {"custom": 1}, + "components": {name: {"custom1": "x"}}, + } + nlp.config["initialize"].update(init_cfg) + nlp.initialize(get_examples) + assert nlp.tokenizer.from_initialize == 1 + pipe = nlp.get_pipe(name) + assert pipe.from_initialize == ("x", False) diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 05e15bc16..d37c87059 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -8,61 +8,52 @@ from ..util import make_tempdir @pytest.fixture def nlp(): - return English() - - -@pytest.fixture -def lemmatizer(nlp): @registry.misc("cope_lookups") def cope_lookups(): lookups = Lookups() - lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups - lemmatizer = nlp.add_pipe( - "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}} - ) - return lemmatizer + nlp = English() + nlp.config["initialize"]["components"]["lemmatizer"] = { + "lookups": {"@misc": "cope_lookups"} + } + return nlp def test_lemmatizer_init(nlp): - @registry.misc("cope_lookups") - def cope_lookups(): - lookups = Lookups() - lookups.add_table("lemma_lookup", {"cope": "cope"}) - lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) - lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) - lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) - return lookups - - lemmatizer = nlp.add_pipe( - "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}} - ) + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) assert isinstance(lemmatizer.lookups, Lookups) + assert not lemmatizer.lookups.tables assert lemmatizer.mode == "lookup" + with pytest.raises(ValueError): + nlp("test") + nlp.initialize() + assert lemmatizer.lookups.tables + assert nlp("cope")[0].lemma_ == 
"cope" + assert nlp("coped")[0].lemma_ == "cope" # replace any tables from spacy-lookups-data lemmatizer.lookups = Lookups() - doc = nlp("coping") # lookup with no tables sets text as lemma - assert doc[0].lemma_ == "coping" - + assert nlp("cope")[0].lemma_ == "cope" + assert nlp("coped")[0].lemma_ == "coped" nlp.remove_pipe("lemmatizer") - - @registry.misc("empty_lookups") - def empty_lookups(): - return Lookups() - + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) with pytest.raises(ValueError): - nlp.add_pipe( - "lemmatizer", - config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}}, - ) + # Can't initialize without required tables + lemmatizer.initialize(lookups=Lookups()) + lookups = Lookups() + lookups.add_table("lemma_lookup", {}) + lemmatizer.initialize(lookups=lookups) -def test_lemmatizer_config(nlp, lemmatizer): +def test_lemmatizer_config(nlp): + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"}) + nlp.initialize() + doc = nlp.make_doc("coping") doc[0].pos_ = "VERB" assert doc[0].lemma_ == "" @@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer): assert doc[0].lemma_ == "cope" -def test_lemmatizer_serialize(nlp, lemmatizer): - @registry.misc("cope_lookups") +def test_lemmatizer_serialize(nlp): + lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"}) + nlp.initialize() + def cope_lookups(): lookups = Lookups() - lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups nlp2 = English() - lemmatizer2 = nlp2.add_pipe( - "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}} - ) + lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"}) + lemmatizer2.initialize(lookups=cope_lookups()) lemmatizer2.from_bytes(lemmatizer.to_bytes()) assert lemmatizer.to_bytes() == lemmatizer2.to_bytes() assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables @@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer): with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) - doc2 = nlp2.make_doc("coping") - doc2[0].pos_ = "VERB" - assert doc2[0].lemma_ == "" - doc2 = lemmatizer(doc2) - assert doc2[0].text == "coping" - assert doc2[0].lemma_ == "cope" + doc2 = nlp2.make_doc("coping") + doc2[0].pos_ = "VERB" + assert doc2[0].lemma_ == "" + doc2 = lemmatizer(doc2) + assert doc2[0].text == "coping" + assert doc2[0].lemma_ == "cope" diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 864c7332e..af81129c0 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -33,7 +33,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("morphologizer") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -42,7 +42,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -50,13 +50,13 @@ def test_no_resize(): morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") morphologizer.add_label("POS" + 
Morphology.FIELD_SEP + "VERB") - nlp.begin_training() + nlp.initialize() # this throws an error because the morphologizer can't be resized after initialization with pytest.raises(ValueError): morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") @@ -64,12 +64,12 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -79,7 +79,7 @@ def test_overfitting_IO(): train_examples = [] for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) for i in range(50): losses = {} @@ -91,7 +91,7 @@ def test_overfitting_IO(): doc = nlp(test_text) gold_morphs = ["Feat=N", "Feat=V", "", ""] gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] - assert [t.morph_ for t in doc] == gold_morphs + assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags # Also test the results are still the same after IO @@ -99,5 +99,5 @@ def test_overfitting_IO(): nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert [t.morph_ for t in doc2] == gold_morphs + assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 5827f8ff1..c64dfcbd6 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -31,19 +31,19 @@ TRAIN_DATA = [ ] -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() nlp.add_pipe("senter") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -58,7 +58,7 @@ def test_overfitting_IO(): train_examples[1].reference[11].is_sent_start = False nlp.add_pipe("senter") - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(200): losses = {} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index cd5927675..b32925d84 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -15,14 
+15,14 @@ def test_label_types(): tagger.add_label(9) -def test_tagger_begin_training_tag_map(): - """Test that Tagger.begin_training() without gold tuples does not clobber +def test_tagger_initialize_tag_map(): + """Test that Tagger.initialize() without gold tuples does not clobber the tag map.""" nlp = Language() tagger = nlp.add_pipe("tagger") orig_tag_count = len(tagger.labels) tagger.add_label("A") - nlp.begin_training() + nlp.initialize() assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) @@ -38,7 +38,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_no_resize(): @@ -47,7 +47,7 @@ def test_no_resize(): tagger.add_label("N") tagger.add_label("V") assert tagger.labels == ("N", "V") - nlp.begin_training() + nlp.initialize() assert tagger.model.get_dim("nO") == 2 # this throws an error because the tagger can't be resized after initialization with pytest.raises(ValueError): @@ -60,10 +60,10 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() tagger = nlp.add_pipe("tagger") train_examples = [] @@ -72,16 +72,16 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: train_examples[0]) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(ValueError): - nlp.begin_training(get_examples=lambda: []) + nlp.initialize(get_examples=lambda: None) + with pytest.raises(TypeError): + nlp.initialize(get_examples=lambda: train_examples[0]) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=lambda: []) + with pytest.raises(ValueError): + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -91,7 +91,7 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert tagger.model.get_dim("nO") == len(TAGS) for i in range(50): @@ -122,4 +122,4 @@ def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 232b53e1d..e0a785851 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.training import Example +from spacy.training.initialize import verify_textcat_config from ..util import make_tempdir -from ...cli.train import verify_textcat_config -from ...training import Example TRAIN_DATA = [ @@ -26,7 
+26,7 @@ def test_simple_train(): nlp = Language() textcat = nlp.add_pipe("textcat") textcat.add_label("answer") - nlp.begin_training() + nlp.initialize() for i in range(5): for text, answer in [ ("aaaa", 1.0), @@ -56,7 +56,7 @@ def test_textcat_learns_multilabel(): textcat = TextCategorizer(nlp.vocab, width=8) for letter in letters: textcat.add_label(letter) - optimizer = textcat.begin_training(lambda: []) + optimizer = textcat.initialize(lambda: []) for i in range(30): losses = {} examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs] @@ -86,7 +86,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("textcat") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -95,7 +95,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -103,14 +103,14 @@ def test_no_resize(): textcat = nlp.add_pipe("textcat") textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") - nlp.begin_training() + nlp.initialize() assert textcat.model.get_dim("nO") == 2 # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() textcat = nlp.add_pipe("textcat") train_examples = [] @@ -119,12 +119,12 @@ def test_begin_training_examples(): for label, value in annotations.get("cats").items(): textcat.add_label(label) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) - with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -139,7 +139,7 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 for i in range(50): @@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) @@ -226,6 +226,7 @@ def test_positive_class_not_binary(): with pytest.raises(ValueError): verify_textcat_config(nlp, pipe_config) + def test_textcat_evaluation(): train_examples = [] nlp = English() @@ -241,15 +242,17 @@ def test_textcat_evaluation(): pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0} train_examples.append(Example(pred2, ref2)) - scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]) - assert scores["cats_f_per_type"]["winter"]["p"] == 1/2 - assert scores["cats_f_per_type"]["winter"]["r"] == 1/1 + 
scores = Scorer().score_cats( + train_examples, "cats", labels=["winter", "summer", "spring", "autumn"] + ) + assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2 + assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1 assert scores["cats_f_per_type"]["summer"]["p"] == 0 - assert scores["cats_f_per_type"]["summer"]["r"] == 0/1 - assert scores["cats_f_per_type"]["spring"]["p"] == 1/1 - assert scores["cats_f_per_type"]["spring"]["r"] == 1/2 - assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2 - assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2 + assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1 + assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1 + assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2 + assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2 + assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2 - assert scores["cats_micro_p"] == 4/5 - assert scores["cats_micro_r"] == 4/6 + assert scores["cats_micro_p"] == 4 / 5 + assert scores["cats_micro_r"] == 4 / 6 diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 558b9079c..06212e351 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co encode_config["width"] = width docs = get_batch(3) tok2vec = build_Tok2Vec_model( - embed_arch(**embed_config), - encode_arch(**encode_config) + embed_arch(**embed_config), encode_arch(**encode_config) ) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) @@ -88,7 +87,7 @@ def test_init_tok2vec(): nlp = English() tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] - nlp.begin_training() + nlp.initialize() assert tok2vec.model.get_dim("nO") @@ -154,7 +153,7 @@ def test_tok2vec_listener(): # Check that the Tok2Vec component finds it listeners assert tok2vec.listeners == [] - optimizer = nlp.begin_training(lambda: train_examples) + optimizer = nlp.initialize(lambda: train_examples) assert tok2vec.listeners == [tagger_tok2vec] for i in range(5): diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index d841ee24b..6bb71f6f4 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -428,7 +428,7 @@ def test_issue999(): for _, offsets in TRAIN_DATA: for start, end, label in offsets: ner.add_label(label) - nlp.begin_training() + nlp.initialize() for itn in range(20): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets in TRAIN_DATA: diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index dce3e8298..f85ec70e1 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -250,7 +250,7 @@ def test_issue1915(): ner = nlp.add_pipe("ner") ner.add_label("answer") with pytest.raises(ValueError): - nlp.begin_training(**cfg) + nlp.initialize(**cfg) def test_issue1945(): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index c4c755153..09baab4d8 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -30,7 +30,7 @@ def test_issue2179(): nlp = Italian() ner = nlp.add_pipe("ner") ner.add_label("CITIZENSHIP") - nlp.begin_training() + nlp.initialize() nlp2 = Italian() nlp2.add_pipe("ner") assert len(nlp2.get_pipe("ner").labels) == 0 diff --git 
a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5895b616e..4952a545d 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -18,7 +18,7 @@ def test_issue2564(): nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() doc = nlp("hello world") assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) @@ -149,7 +149,7 @@ def test_issue2800(): ner = nlp.add_pipe("ner") for entity_type in list(entity_types): ner.add_label(entity_type) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(20): losses = {} random.shuffle(train_data) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 56ef23dbf..01f58ae77 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -59,7 +59,7 @@ def test_issue3012(en_vocab): words = ["This", "is", "10", "%", "."] tags = ["DT", "VBZ", "CD", "NN", "."] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = [("PERCENT", 2, 4)] + ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) assert doc.has_annotation("TAG") expected = ("10", "NUM", "CD", "PERCENT") @@ -92,7 +92,7 @@ def test_issue3209(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("ANIMAL") - nlp.begin_training() + nlp.initialize() move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] assert ner.move_names == move_names nlp2 = English() @@ -239,7 +239,7 @@ def test_issue3456(): nlp = English() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() list(nlp.pipe(["hi", ""])) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 304e654c3..0505571c2 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -223,15 +223,13 @@ def test_issue3611(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) def test_issue3625(): @@ -268,7 +266,7 @@ def test_issue3830_no_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -283,7 +281,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels @@ -342,7 +340,7 @@ def test_issue3880(): nlp.add_pipe("parser").add_label("dep") nlp.add_pipe("ner").add_label("PERSON") nlp.add_pipe("tagger").add_label("NN") - nlp.begin_training() + nlp.initialize() for doc in nlp.pipe(texts): pass @@ -390,7 +388,7 @@ def test_issue3959(): def test_issue3962(en_vocab): - """ Ensure that as_doc does not result in out-of-bound 
access of tokens. + """Ensure that as_doc does not result in out-of-bound access of tokens. This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] @@ -428,7 +426,7 @@ def test_issue3962(en_vocab): def test_issue3962_long(en_vocab): - """ Ensure that as_doc does not result in out-of-bound access of tokens. + """Ensure that as_doc does not result in out-of-bound access of tokens. This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] @@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab): def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs. - """ + """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" matcher = PhraseMatcher(en_vocab) matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 7b7ddfe0d..0e2579ac4 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -19,8 +19,7 @@ from ..util import make_tempdir def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes. - """ + """Test that the PhraseMatcher can match on overwritten NORM attributes.""" matcher = PhraseMatcher(en_vocab, attr="NORM") pattern1 = Doc(en_vocab, words=["c", "d"]) assert [t.norm_ for t in pattern1] == ["c", "d"] @@ -66,15 +65,13 @@ def test_issue4030(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) # processing of an empty doc should result in 0.0 for all categories doc = nlp("") assert doc.cats["offensive"] == 0.0 @@ -87,7 +84,7 @@ def test_issue4042(): # add ner pipe ner = nlp.add_pipe("ner") ner.add_label("SOME_LABEL") - nlp.begin_training() + nlp.initialize() # Add entity ruler patterns = [ {"label": "MY_ORG", "pattern": "Apple"}, @@ -118,7 +115,7 @@ def test_issue4042_bug2(): # add ner pipe ner1 = nlp1.add_pipe("ner") ner1.add_label("SOME_LABEL") - nlp1.begin_training() + nlp1.initialize() # add a new label to the doc doc1 = nlp1("What do you think about Apple ?") assert len(ner1.labels) == 1 @@ -244,7 +241,7 @@ def test_issue4267(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("PEOPLE") - nlp.begin_training() + nlp.initialize() assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") @@ -299,7 +296,7 @@ def test_issue4313(): config = {} ner = nlp.create_pipe("ner", config=config) ner.add_label("SOME_LABEL") - ner.begin_training(lambda: []) + ner.initialize(lambda: []) # add a new label to the doc doc = nlp("What do you think about Apple ?") assert len(ner.labels) == 1 @@ -327,7 +324,7 @@ def test_issue4348(): TRAIN_DATA = [example, example] tagger = nlp.add_pipe("tagger") tagger.add_label("A") - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): 
losses = {} batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index e351858f5..6dbbc233b 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -180,7 +180,7 @@ def test_issue4725_2(): vocab.set_vector("dog", data[1]) nlp = English(vocab=vocab) nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() docs = ["Kurt is in London."] * 10 for _ in nlp.pipe(docs, batch_size=2, n_process=2): pass diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 531e48ec3..5e320996a 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - nlp.begin_training() + nlp.initialize() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - nlp.begin_training() + nlp.initialize() return entity_linker diff --git a/spacy/tests/regression/test_issue5551.py b/spacy/tests/regression/test_issue5551.py index b7139d463..655764362 100644 --- a/spacy/tests/regression/test_issue5551.py +++ b/spacy/tests/regression/test_issue5551.py @@ -25,7 +25,7 @@ def test_issue5551(): pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) for label in set(example[1]["cats"]): pipe.add_label(label) - nlp.begin_training() + nlp.initialize() # Store the result of each iteration result = pipe.model.predict([nlp.make_doc(example[0])]) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index eb5f15007..da048f3d6 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -14,8 +14,8 @@ from ..util import make_tempdir nlp_config_string = """ [paths] -train = "" -dev = "" +train = null +dev = null [corpora] @@ -152,7 +152,7 @@ def test_serialize_nlp(): nlp_config = Config().from_str(nlp_config_string) nlp = load_model_from_config(nlp_config, auto_fill=True) nlp.get_pipe("tagger").add_label("A") - nlp.begin_training() + nlp.initialize() assert "tok2vec" in nlp.pipe_names assert "tagger" in nlp.pipe_names assert "parser" not in nlp.pipe_names @@ -173,7 +173,7 @@ def test_serialize_custom_nlp(): parser_cfg = dict() parser_cfg["model"] = {"@architectures": "my_test_parser"} nlp.add_pipe("parser", config=parser_cfg) - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) @@ -191,7 +191,7 @@ def test_serialize_parser(): model_config = Config().from_str(parser_config_string) parser = nlp.add_pipe("parser", config=model_config) parser.add_label("nsubj") - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) @@ -309,7 +309,7 @@ def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["corpora"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] is None nlp = English.from_config(config) assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config @@ -317,10 +317,10 @@ def 
test_config_interpolation(): assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["corpora"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] is None assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["corpora"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] is None assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 4a976fc02..8b6adb83b 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,3 +1,6 @@ +import pytest +from spacy.tokens.doc import Underscore + import spacy from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -86,3 +89,20 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab): assert re_doc1.text == "that 's " assert not re_doc2.has_unknown_spaces assert re_doc2.text == "that's" + + +@pytest.mark.parametrize( + "writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")] +) +def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): + """Test that custom extensions are correctly serialized in DocBin.""" + Doc.set_extension("foo", default="nothing") + doc = Doc(en_vocab, words=["hello", "world"]) + doc._.foo = "bar" + doc_bin_1 = DocBin(store_user_data=writer_flag) + doc_bin_1.add(doc) + doc_bin_bytes = doc_bin_1.to_bytes() + doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) + doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] + assert doc_2._.foo == reader_value + Underscore.doc_extensions = {} diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index caf4ea890..62584d0ce 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -3,12 +3,12 @@ from click import NoSuchOption from spacy.training import docs_to_json, offsets_to_biluo_tags from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.util import ENV_VARS from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from spacy.cli.debug_config import check_section_refs -from thinc.api import ConfigValidationError, Config +from spacy.cli._util import string_to_list +from thinc.api import ConfigValidationError import srsly import os @@ -343,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args): def test_parse_cli_overrides(): - os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello" + overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello" + os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides result = parse_config_overrides([]) assert len(result) == 4 assert result["x.foo"] == "bar" assert result["x.bar"] == 12 assert result["x.baz"] is False assert result["y.foo"] == "hello" - os.environ[OVERRIDES_ENV_VAR] = "--x" + os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x" assert 
parse_config_overrides([], env_var=None) == {} with pytest.raises(SystemExit): parse_config_overrides([]) - os.environ[OVERRIDES_ENV_VAR] = "hello world" + os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world" with pytest.raises(SystemExit): parse_config_overrides([]) - del os.environ[OVERRIDES_ENV_VAR] + del os.environ[ENV_VARS.CONFIG_OVERRIDES] @pytest.mark.parametrize("lang", ["en", "nl"]) @@ -414,15 +415,3 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] - - -def test_check_section_refs(): - config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} - config = Config(config) - # Valid section reference - check_section_refs(config, ["a.b.c"]) - # Section that doesn't exist in this config - check_section_refs(config, ["x.y.z"]) - # Invalid section reference - with pytest.raises(ConfigValidationError): - check_section_refs(config, ["a.b.c", "f.g"]) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index da46ad424..917e7552e 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -18,7 +18,7 @@ def nlp(): textcat = nlp.add_pipe("textcat") for label in ("POSITIVE", "NEGATIVE"): textcat.add_label(label) - nlp.begin_training() + nlp.initialize() return nlp @@ -290,9 +290,7 @@ def test_spacy_blank(): assert nlp.meta["name"] == "my_custom_model" -@pytest.mark.parametrize( - "value", [False, None, ["x", "y"], Language, Vocab], -) +@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab]) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" with pytest.raises(ValueError) as e: diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4e079d29e..e6ef45f90 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -7,7 +7,6 @@ from spacy import util from spacy import prefer_gpu, require_gpu from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding -from thinc.api import Optimizer @pytest.fixture @@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected): result = util.dot_to_dict(dot_notation) assert result == expected assert util.dict_to_dot(result) == dot_notation - - -def test_resolve_training_config(): - config = { - "nlp": {"lang": "en", "disabled": []}, - "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}}, - "corpora": {}, - } - resolved = util.resolve_training_config(config) - assert resolved["training"]["dropout"] == 0.1 - assert isinstance(resolved["training"]["optimizer"], Optimizer) - assert resolved["corpora"] == {} - assert "nlp" not in resolved diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 8f1bb1c3d..a123f459d 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -64,7 +64,7 @@ def get_tok2vec_kwargs(): width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False ), "encode": MaxoutWindowEncoder( - width=32, depth=2, maxout_pieces=2, window_size=1, + width=32, depth=2, maxout_pieces=2, window_size=1 ), } diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 2825f1703..039f3d4d8 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -76,7 +76,7 @@ def tagged_doc(): for i in range(len(tags)): doc[i].tag_ = tags[i] doc[i].pos_ = pos[i] - doc[i].morph_ = morphs[i] + doc[i].set_morph(morphs[i]) if i > 0: 
doc[i].is_sent_start = False return doc @@ -137,7 +137,7 @@ def test_las_per_type(en_vocab): examples = [] for input_, annot in test_las_apple: doc = Doc( - en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"], + en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"] ) gold = {"heads": annot["heads"], "deps": annot["deps"]} example = Example.from_dict(doc, gold) @@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab): doc = Doc( en_vocab, words=input_.split(" "), - ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], + ents=["B-CARDINAL", "O", "B-CARDINAL"], ) entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) @@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab): doc = Doc( en_vocab, words=input_.split(" "), - ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], + ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"], ) entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) @@ -242,7 +242,7 @@ def test_tag_score(tagged_doc): gold = { "tags": [t.tag_ for t in tagged_doc], "pos": [t.pos_ for t in tagged_doc], - "morphs": [t.morph_ for t in tagged_doc], + "morphs": [str(t.morph) for t in tagged_doc], "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc], } example = Example.from_dict(tagged_doc, gold) @@ -259,7 +259,7 @@ def test_tag_score(tagged_doc): tags[0] = "NN" pos = [t.pos_ for t in tagged_doc] pos[1] = "X" - morphs = [t.morph_ for t in tagged_doc] + morphs = [str(t.morph) for t in tagged_doc] morphs[1] = "Number=sing" morphs[2] = "Number=plur" gold = { diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 0647b8556..f710a38eb 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,14 +1,15 @@ import pytest -from .util import get_random_doc - from spacy import util from spacy.util import dot_to_object, SimpleFrozenList -from thinc.api import Config, Optimizer +from thinc.api import Config, Optimizer, ConfigValidationError from spacy.training.batchers import minibatch_by_words -from ..lang.en import English -from ..lang.nl import Dutch -from ..language import DEFAULT_CONFIG_PATH +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import DEFAULT_CONFIG_PATH +from spacy.schemas import ConfigSchemaTraining + +from .util import get_random_doc @pytest.mark.parametrize( @@ -101,8 +102,8 @@ def test_util_dot_section(): dot_to_object(en_nlp.config, "nlp.pipeline.tagger") with pytest.raises(KeyError): dot_to_object(en_nlp.config, "nlp.unknownattribute") - resolved = util.resolve_training_config(nl_nlp.config) - assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer) + T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining) + assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer) def test_simple_frozen_list(): @@ -120,3 +121,17 @@ def test_simple_frozen_list(): t = SimpleFrozenList(["foo", "bar"], error="Error!") with pytest.raises(NotImplementedError): t.append("baz") + + +def test_resolve_dot_names(): + config = { + "training": {"optimizer": {"@optimizers": "Adam.v1"}}, + "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, + } + result = util.resolve_dot_names(config, ["training.optimizer"]) + assert isinstance(result[0], Optimizer) + with pytest.raises(ConfigValidationError) as e: + util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) + errors = 
e.value.errors + assert len(errors) == 1 + assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 81207b640..06db86a12 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots): predicted = Doc(vocab, words=annots["words"]) example = Example.from_dict(predicted, annots) for i, token in enumerate(example.reference): - assert token.morph_ == annots["morphs"][i] + assert str(token.morph) == annots["morphs"][i] @pytest.mark.parametrize( diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c06c9d282..9d82ca50a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config from spacy import Language -from spacy.util import load_model_from_config, registry, dot_to_object -from spacy.util import resolve_training_config +from spacy.util import load_model_from_config, registry, resolve_dot_names +from spacy.schemas import ConfigSchemaTraining from spacy.training import Example @@ -39,21 +39,24 @@ def test_readers(): config = Config().from_str(config_string) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + T = registry.resolve( + nlp.config.interpolate()["training"], schema=ConfigSchemaTraining + ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) assert isinstance(train_corpus, Callable) - optimizer = resolved["training"]["optimizer"] + optimizer = T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) scores = nlp.evaluate(list(dev_corpus(nlp))) assert scores["cats_score"] # ensure the pipeline runs doc = nlp("Quick test") assert doc.cats - extra_corpus = resolved["corpora"]["extra"] + corpora = {"corpora": nlp.config.interpolate()["corpora"]} + extra_corpus = registry.resolve(corpora)["corpora"]["extra"] assert isinstance(extra_corpus, Callable) @@ -89,18 +92,20 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - resolved = resolve_training_config(nlp.config) - train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) - optimizer = resolved["training"]["optimizer"] + T = registry.resolve( + nlp.config["training"].interpolate(), schema=ConfigSchemaTraining + ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) + optimizer = T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): assert example.y.cats # this shouldn't fail if each training example has at least one positive label assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark 
on dev corpus - dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 5311fae1e..7d41c8908 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -11,6 +11,7 @@ from spacy.util import get_words_and_spaces, minibatch from thinc.api import compounding import pytest import srsly +import random from ..util import make_tempdir @@ -29,7 +30,12 @@ def doc(en_vocab): heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] - ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9)) + ents = ["O"] * len(words) + ents[0] = "B-PERSON" + ents[1] = "I-PERSON" + ents[5] = "B-LOC" + ents[6] = "I-LOC" + ents[8] = "B-GPE" cats = {"TRAVEL": 1.0, "BAKING": 0.0} # fmt: on doc = Doc( @@ -454,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc): idx = [t.idx for t in doc] tags = [t.tag_ for t in doc] pos = [t.pos_ for t in doc] - morphs = [t.morph_ for t in doc] + morphs = [str(t.morph) for t in doc] lemmas = [t.lemma_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] @@ -476,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc): assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference] - assert morphs == [t.morph_ for t in reloaded_example.reference] + assert morphs == [str(t.morph) for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference] @@ -492,12 +498,54 @@ def test_roundtrip_docs_to_docbin(doc): @pytest.mark.filterwarnings("ignore::UserWarning") def test_make_orth_variants(doc): nlp = English() + orth_variants = { + "single": [ + {"tags": ["NFP"], "variants": ["…", "..."]}, + {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + } + augmenter = create_orth_variants_augmenter( + level=0.2, lower=0.5, orth_variants=orth_variants + ) with make_tempdir() as tmpdir: output_file = tmpdir / "roundtrip.spacy" DocBin(docs=[doc]).to_disk(output_file) # due to randomness, test only that this runs with no errors for now - reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)) - train_examples = list(reader(nlp)) + reader = Corpus(output_file, augmenter=augmenter) + list(reader(nlp)) + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_custom_data_augmentation(doc): + def create_spongebob_augmenter(randomize: bool = False): + def augment(nlp, example): + text = example.text + if randomize: + ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text] + else: + ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)] + example_dict = example.to_dict() + doc = nlp.make_doc("".join(ch)) + example_dict["token_annotation"]["ORTH"] = [t.text for t in doc] + yield example + yield example.from_dict(doc, example_dict) + + return augment + + nlp = English() + with make_tempdir() as tmpdir: + output_file = tmpdir / "roundtrip.spacy" + DocBin(docs=[doc]).to_disk(output_file) + reader = 
Corpus(output_file, augmenter=create_spongebob_augmenter()) + corpus = list(reader(nlp)) + orig_text = "Sarah 's sister flew to Silicon Valley via London . " + augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . " + assert corpus[0].text == orig_text + assert corpus[0].reference.text == orig_text + assert corpus[0].predicted.text == orig_text + assert corpus[1].text == augmented + assert corpus[1].reference.text == augmented + assert corpus[1].predicted.text == augmented @pytest.mark.skip("Outdated") @@ -599,7 +647,7 @@ def _train_tuples(train_data): train_examples = [] for t in train_data: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 2d4e9af9d..11eb75821 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -58,7 +58,7 @@ class DocBin: attrs (Iterable[str]): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. - store_user_data (bool): Whether to include the `Doc.user_data`. + store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file. docs (Iterable[Doc]): Docs to add. DOCS: https://nightly.spacy.io/api/docbin#init @@ -101,16 +101,17 @@ class DocBin: self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) - self.strings.add(token.morph_) + self.strings.add(str(token.morph)) self.strings.add(token.dep_) self.strings.add(token.ent_type_) self.strings.add(token.ent_kb_id_) self.cats.append(doc.cats) - if self.store_user_data: - self.user_data.append(srsly.msgpack_dumps(doc.user_data)) + self.user_data.append(srsly.msgpack_dumps(doc.user_data)) def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. + Note that the user data of each doc will be read (if available) and returned, + regardless of the setting of 'self.store_user_data'. vocab (Vocab): The shared vocab. YIELDS (Doc): The Doc objects. @@ -129,7 +130,7 @@ class DocBin: doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = doc.from_array(self.attrs, tokens) doc.cats = self.cats[i] - if self.store_user_data: + if i < len(self.user_data) and self.user_data[i] is not None: user_data = srsly.msgpack_loads(self.user_data[i], use_list=False) doc.user_data.update(user_data) yield doc @@ -137,21 +138,31 @@ class DocBin: def merge(self, other: "DocBin") -> None: """Extend the annotations of this DocBin with the annotations from another. Will raise an error if the pre-defined attrs of the two - DocBins don't match. + DocBins don't match, or if they differ in whether or not to store + user data. other (DocBin): The DocBin to merge into the current bin. 
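# A minimal sketch of the user-data behaviour changed in this file: add() now
# always stores Doc.user_data, only the writer's store_user_data flag decides
# whether it ends up in the serialized data, and get_docs() restores it
# whenever it is present, regardless of the reader's flag. The "foo" extension
# and the text are illustrative.
import spacy
from spacy.tokens import Doc, DocBin

nlp = spacy.blank("en")
Doc.set_extension("foo", default="nothing")
doc = nlp.make_doc("hello world")
doc._.foo = "bar"

doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
payload = doc_bin.to_bytes()

# The reader's store_user_data setting no longer matters when loading:
reloaded = list(DocBin(store_user_data=False).from_bytes(payload).get_docs(nlp.vocab))
assert reloaded[0]._.foo == "bar"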
DOCS: https://nightly.spacy.io/api/docbin#merge """ if self.attrs != other.attrs: - raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs)) + raise ValueError( + Errors.E166.format(param="attrs", current=self.attrs, other=other.attrs) + ) + if self.store_user_data != other.store_user_data: + raise ValueError( + Errors.E166.format( + param="store_user_data", + current=self.store_user_data, + other=other.store_user_data, + ) + ) self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) self.strings.update(other.strings) self.cats.extend(other.cats) self.flags.extend(other.flags) - if self.store_user_data: - self.user_data.extend(other.user_data) + self.user_data.extend(other.user_data) def to_bytes(self) -> bytes: """Serialize the DocBin's annotations to a bytestring. @@ -200,8 +211,10 @@ class DocBin: self.spaces = NumpyOps().unflatten(flat_spaces, lengths) self.cats = msg["cats"] self.flags = msg.get("flags", [{} for _ in lengths]) - if self.store_user_data and "user_data" in msg: + if "user_data" in msg: self.user_data = list(msg["user_data"]) + else: + self.user_data = [None] * len(self) for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape # this should never happen return self diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b4027f87e..9dfa6e714 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -213,8 +213,9 @@ cdef class Doc: sent_starts (Optional[List[Union[bool, None]]]): A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to None. - ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of - (label, start, end) tuples to assign as doc.ents. Defaults to None. + ents (Optional[List[str]]): A list of unicode strings, of the same + length as words, as IOB tags to assign as token.ent_iob and + token.ent_type. Defaults to None. 
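# A short sketch of the per-token IOB format described above, using a blank
# English vocab; the sentence and labels are illustrative. A stray "I-" tag
# after "O" is normalised to valid IOB2, and "" or None marks a token with
# missing NER annotation.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["Kurt", "is", "in", "London", "."]
ents = ["B-PERSON", "O", "O", "B-GPE", "O"]  # one tag per token
doc = Doc(nlp.vocab, words=words, ents=ents)
assert [(e.text, e.label_) for e in doc.ents] == [("Kurt", "PERSON"), ("London", "GPE")]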
DOCS: https://nightly.spacy.io/api/doc#init """ @@ -275,16 +276,55 @@ cdef class Doc: sent_starts[i] = -1 elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: sent_starts[i] = 0 + ent_iobs = None + ent_types = None + if ents is not None: + iob_strings = Token.iob_strings() + # make valid IOB2 out of IOB1 or IOB2 + for i, ent in enumerate(ents): + if ent is "": + ents[i] = None + elif ent is not None and not isinstance(ent, str): + raise ValueError(Errors.E177.format(tag=ent)) + if i < len(ents) - 1: + # OI -> OB + if (ent is None or ent.startswith("O")) and \ + (ents[i+1] is not None and ents[i+1].startswith("I")): + ents[i+1] = "B" + ents[i+1][1:] + # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2 + if ent is not None and ents[i+1] is not None and \ + (ent.startswith("B") or ent.startswith("I")) and \ + ents[i+1].startswith("I") and \ + ent[1:] != ents[i+1][1:]: + ents[i+1] = "B" + ents[i+1][1:] + ent_iobs = [] + ent_types = [] + for ent in ents: + if ent is None: + ent_iobs.append(iob_strings.index("")) + ent_types.append("") + elif ent == "O": + ent_iobs.append(iob_strings.index(ent)) + ent_types.append("") + else: + if len(ent) < 3 or ent[1] != "-": + raise ValueError(Errors.E177.format(tag=ent)) + ent_iob, ent_type = ent.split("-", 1) + if ent_iob not in iob_strings: + raise ValueError(Errors.E177.format(tag=ent)) + ent_iob = iob_strings.index(ent_iob) + ent_iobs.append(ent_iob) + ent_types.append(ent_type) headings = [] values = [] - annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START] + annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE] for a, annot in enumerate(annotations): if annot is not None: if len(annot) != len(words): raise ValueError(Errors.E189) headings.append(possible_headings[a]) - if annot is not heads and annot is not sent_starts: + if annot is not heads and annot is not sent_starts and annot is not ent_iobs: values.extend(annot) for value in values: self.vocab.strings.add(value) @@ -296,7 +336,7 @@ cdef class Doc: j = 0 for annot in annotations: if annot: - if annot is heads or annot is sent_starts: + if annot is heads or annot is sent_starts or annot is ent_iobs: for i in range(len(words)): if attrs.ndim == 1: attrs[i] = annot[i] @@ -317,8 +357,6 @@ cdef class Doc: attrs[i, j] = self.vocab.strings[annot[i]] j += 1 self.from_array(headings, attrs) - if ents is not None: - self.ents = ents @property def _(self): @@ -1210,7 +1248,7 @@ cdef class Doc: for token in self: strings.add(token.tag_) strings.add(token.lemma_) - strings.add(token.morph_) + strings.add(str(token.morph)) strings.add(token.dep_) strings.add(token.ent_type_) strings.add(token.ent_kb_id_) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 239de4559..2075c3cc8 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -215,20 +215,22 @@ cdef class Token: def __get__(self): return MorphAnalysis.from_id(self.vocab, self.c.morph) - def __set__(self, attr_t morph): - if morph == 0: - self.c.morph = morph - elif morph in self.vocab.strings: - self.morph_ = self.vocab.strings[morph] - else: - raise ValueError(Errors.E1009.format(val=morph)) + def __set__(self, MorphAnalysis morph): + # Check that the morph has the same vocab + if self.vocab != morph.vocab: + raise ValueError(Errors.E1013) + self.c.morph = morph.c.key - property morph_: - def 
__get__(self): - return str(MorphAnalysis.from_id(self.vocab, self.c.morph)) - - def __set__(self, features): - cdef hash_t key = self.vocab.morphology.add(features) + def set_morph(self, features): + cdef hash_t key + if features is None: + self.c.morph = 0 + elif isinstance(features, MorphAnalysis): + self.morph = features + else: + if isinstance(features, int): + features = self.vocab.strings[features] + key = self.vocab.morphology.add(features) self.c.morph = key @property diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 4d487ce93..8965c5457 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -1,29 +1,57 @@ -from typing import Callable +from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING import random import itertools import copy from functools import partial -from ..util import registry +from pydantic import BaseModel, StrictStr + +from ..util import registry, logger +from ..tokens import Doc +from .example import Example + +if TYPE_CHECKING: + from ..language import Language # noqa: F401 -@registry.augmenters("spacy.dont_augment.v1") -def create_null_augmenter(): - return dont_augment +class OrthVariantsSingle(BaseModel): + tags: List[StrictStr] + variants: List[StrictStr] + + +class OrthVariantsPaired(BaseModel): + tags: List[StrictStr] + variants: List[List[StrictStr]] + + +class OrthVariants(BaseModel): + paired: List[OrthVariantsPaired] = {} + single: List[OrthVariantsSingle] = {} @registry.augmenters("spacy.orth_variants.v1") -def create_orth_variants_augmenter(level: float, lower: float) -> Callable: +def create_orth_variants_augmenter( + level: float, lower: float, orth_variants: OrthVariants, +) -> Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. """ - return partial(orth_variants_augmenter, level=level, lower=lower) + return partial( + orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower + ) -def dont_augment(nlp, example): +def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]: yield example -def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0): +def orth_variants_augmenter( + nlp: "Language", + example: Example, + orth_variants: dict, + *, + level: float = 0.0, + lower: float = 0.0, +) -> Iterator[Example]: if random.random() >= level: yield example else: @@ -36,18 +64,31 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0. 
nlp, raw_text, orig_dict["token_annotation"], - lower=raw_text is not None and random.random() < lower + orth_variants, + lower=raw_text is not None and random.random() < lower, ) - doc = nlp.make_doc(variant_text) + if variant_text: + doc = nlp.make_doc(variant_text) + else: + doc = Doc(nlp.vocab, words=variant_token_annot["ORTH"]) + variant_token_annot["ORTH"] = [w.text for w in doc] + variant_token_annot["SPACY"] = [w.whitespace_ for w in doc] orig_dict["token_annotation"] = variant_token_annot yield example.from_dict(doc, orig_dict) -def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False): +def make_orth_variants( + nlp: "Language", + raw: str, + token_dict: Dict[str, List[str]], + orth_variants: Dict[str, List[Dict[str, List[str]]]], + *, + lower: bool = False, +) -> Tuple[str, Dict[str, List[str]]]: orig_token_dict = copy.deepcopy(token_dict) - orth_variants = nlp.vocab.lookups.get_table("orth_variants", {}) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) + logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") words = token_dict.get("words", []) tags = token_dict.get("tags", []) # keep unmodified if words or tags are not defined diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index 18a2b6a93..2e6084ae5 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -207,6 +207,7 @@ def conllu_sentence_to_doc( pos=poses, deps=deps, lemmas=lemmas, + morphs=morphs, heads=heads, ) for i in range(len(doc)): diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 90eb62474..b3ff30e66 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -7,7 +7,7 @@ import srsly from .. import util from .augment import dont_augment from .example import Example -from ..errors import Warnings +from ..errors import Warnings, Errors from ..tokens import DocBin, Doc from ..vocab import Vocab @@ -20,12 +20,15 @@ FILE_TYPE = ".spacy" @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( - path: Path, + path: Optional[Path], gold_preproc: bool, max_length: int = 0, limit: int = 0, augmenter: Optional[Callable] = None, ) -> Callable[["Language"], Iterable[Example]]: + if path is None: + raise ValueError(Errors.E913) + util.logger.debug(f"Loading corpus from path: {path}") return Corpus( path, gold_preproc=gold_preproc, @@ -35,11 +38,20 @@ def create_docbin_reader( ) -@util.registry.readers("spacy.JsonlReader.v1") +@util.registry.readers("spacy.JsonlCorpus.v1") def create_jsonl_reader( path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0 ) -> Callable[["Language"], Iterable[Doc]]: - return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit) + return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit) + + +@util.registry.readers("spacy.read_labels.v1") +def read_labels(path: Path, *, require: bool = False): + # I decided not to give this a generic name, because I don't want people to + # use it for arbitrary stuff, as I want this require arg with default False. + if not require and not path.exists(): + return None + return srsly.read_json(path) def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: @@ -181,7 +193,7 @@ class Corpus: break -class JsonlTexts: +class JsonlCorpus: """Iterate Doc objects from a file or directory of jsonl formatted raw text files. 
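# A sketch of how the reworked augmenter is wired up after this change: the
# orth variants are now passed in explicitly instead of being read from the
# vocab's "orth_variants" lookups table, and the resulting callback is handed
# to a Corpus. The path and the variant table are illustrative.
import spacy
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

orth_variants = {
    "single": [{"tags": ["NFP"], "variants": ["…", "..."]}],
}
augmenter = create_orth_variants_augmenter(
    level=0.2, lower=0.5, orth_variants=orth_variants
)
corpus = Corpus("corpus/train.spacy", augmenter=augmenter)
nlp = spacy.blank("en")
examples = list(corpus(nlp))  # each stored Doc comes back as an (optionally augmented) Example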
@@ -194,7 +206,7 @@ class JsonlTexts: limit (int): Limit corpus to a subset of examples, e.g. for debugging. Defaults to 0, which indicates no limit. - DOCS: https://nightly.spacy.io/api/corpus#jsonltexts + DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus """ file_type = "jsonl" @@ -218,7 +230,7 @@ class JsonlTexts: nlp (Language): The current nlp object. YIELDS (Example): The example objects. - DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call + DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call """ for loc in walk_corpus(self.path, ".jsonl"): records = srsly.read_jsonl(loc) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index f2c78203a..f6225135c 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -1,4 +1,4 @@ -from collections import Iterable as IterableInstance +from collections.abc import Iterable as IterableInstance import warnings import numpy from murmurhash.mrmr cimport hash64 @@ -226,7 +226,7 @@ cdef class Example: "TAG": [t.tag_ for t in self.reference], "LEMMA": [t.lemma_ for t in self.reference], "POS": [t.pos_ for t in self.reference], - "MORPH": [t.morph_ for t in self.reference], + "MORPH": [str(t.morph) for t in self.reference], "HEAD": [t.head.i for t in self.reference], "DEP": [t.dep_ for t in self.reference], "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference] diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 8b9f5ab2b..8fb6b8565 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if include_annotation["POS"]: json_token["pos"] = token.pos_ if include_annotation["MORPH"]: - json_token["morph"] = token.morph_ + json_token["morph"] = str(token.morph) if include_annotation["LEMMA"]: json_token["lemma"] = token.lemma_ if include_annotation["DEP"]: diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py new file mode 100644 index 000000000..d64f211c4 --- /dev/null +++ b/spacy/training/initialize.py @@ -0,0 +1,289 @@ +from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING +from thinc.api import Config, fix_random_seed, set_gpu_allocator +from thinc.api import ConfigValidationError +from pathlib import Path +import srsly +import numpy +import tarfile +import gzip +import zipfile +import tqdm + +from ..lookups import Lookups +from ..vectors import Vectors +from ..errors import Errors +from ..schemas import ConfigSchemaTraining +from ..util import registry, load_model_from_config, resolve_dot_names, logger +from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB + +if TYPE_CHECKING: + from ..language import Language # noqa: F401 + + +def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": + raw_config = config + config = raw_config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + # Use original config here before it's resolved to functions + sourced_components = get_sourced_components(config) + nlp = load_model_from_config(raw_config, auto_fill=True) + logger.info("Set up nlp object from config") + config = nlp.config.interpolate() + # Resolve all training-relevant sections using the filled nlp config + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = 
resolve_dot_names(config, dot_names) + optimizer = T["optimizer"] + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Sourced components that require resume_training + resume_components = [p for p in sourced_components if p not in frozen_components] + logger.info(f"Pipeline: {nlp.pipe_names}") + if resume_components: + with nlp.select_pipes(enable=resume_components): + logger.info(f"Resuming training for: {resume_components}") + nlp.resume_training(sgd=optimizer) + with nlp.select_pipes(disable=[*frozen_components, *resume_components]): + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) + logger.info("Initialized pipeline components") + # Verify the config after calling 'initialize' to ensure labels + # are properly initialized + verify_config(nlp) + return nlp + + +def init_vocab( + nlp: "Language", + *, + data: Optional[Path] = None, + lookups: Optional[Lookups] = None, + vectors: Optional[str] = None, +) -> "Language": + if lookups: + nlp.vocab.lookups = lookups + logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + data_path = ensure_path(data) + if data_path is not None: + lex_attrs = srsly.read_jsonl(data_path) + for lexeme in nlp.vocab: + lexeme.rank = OOV_RANK + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + if len(nlp.vocab): + oov_prob = min(lex.prob for lex in nlp.vocab) - 1 + else: + oov_prob = DEFAULT_OOV_PROB + nlp.vocab.cfg.update({"oov_prob": oov_prob}) + logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Created vocabulary") + if vectors is not None: + load_vectors_into_model(nlp, vectors) + logger.info(f"Added vectors: {vectors}") + logger.info("Finished initializing nlp object") + + +def load_vectors_into_model( + nlp: "Language", name: Union[str, Path], *, add_strings: bool = True +) -> None: + """Load word vectors from an installed model or path into a model instance.""" + try: + vectors_nlp = load_model(name) + except ConfigValidationError as e: + title = f"Config validation error for vectors {name}" + desc = ( + "This typically means that there's a problem in the config.cfg included " + "with the packaged vectors. Make sure that the vectors package you're " + "loading is compatible with the current version of spaCy." + ) + err = ConfigValidationError.from_error(config=None, title=title, desc=desc) + raise err from None + nlp.vocab.vectors = vectors_nlp.vocab.vectors + if add_strings: + # I guess we should add the strings from the vectors_nlp model? + # E.g. if someone does a similarity query, they might expect the strings. + for key in nlp.vocab.vectors.key2row: + if key in vectors_nlp.vocab.strings: + nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) + + +def init_tok2vec( + nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any] +) -> bool: + # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' + P = pretrain_config + I = init_config + weights_data = None + init_tok2vec = ensure_path(I["init_tok2vec"]) + if init_tok2vec is not None: + if P["objective"].get("type") == "vectors" and not I["vectors"]: + err = 'need initialize.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + if not init_tok2vec.exists(): + err = f"can't find pretrained tok2vec: {init_tok2vec}" + errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}] + raise ConfigValidationError(config=nlp.config, errors=errors) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() + if weights_data is not None: + tok2vec_component = P["component"] + if tok2vec_component is None: + desc = ( + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them." + ) + err = "component can't be null" + errors = [{"loc": ["pretraining", "component"], "msg": err}] + raise ConfigValidationError( + config=nlp.config["pretraining"], errors=errors, desc=desc + ) + layer = nlp.get_pipe(tok2vec_component).model + if P["layer"]: + layer = layer.get_ref(P["layer"]) + layer.from_bytes(weights_data) + return True + return False + + +def verify_config(nlp: "Language") -> None: + """Perform additional checks based on the config, loaded nlp object and training data.""" + # TODO: maybe we should validate based on the actual components, the list + # in config["nlp"]["pipeline"] instead? + for pipe_config in nlp.config["components"].values(): + # We can't assume that the component name == the factory + factory = pipe_config["factory"] + if factory == "textcat": + verify_textcat_config(nlp, pipe_config) + + +def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None: + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if pipe_config.get("positive_label"): + textcat_labels = nlp.get_pipe("textcat").labels + pos_label = pipe_config.get("positive_label") + if pos_label not in textcat_labels: + raise ValueError( + Errors.E920.format(pos_label=pos_label, labels=textcat_labels) + ) + if len(list(textcat_labels)) != 2: + raise ValueError( + Errors.E919.format(pos_label=pos_label, labels=textcat_labels) + ) + + +def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: + """RETURNS (List[str]): All sourced components in the original config, + e.g. {"source": "en_core_web_sm"}. If the config contains a key + "factory", we assume it refers to a component factory. 
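# A tiny worked example for the helper above, assuming this module layout; the
# function only inspects the config dict, so the sourced pipeline does not
# have to be installed:
from spacy.training.initialize import get_sourced_components

config = {
    "components": {
        "ner": {"source": "en_core_web_sm"},  # sourced -> listed
        "tagger": {"factory": "tagger"},      # has a factory -> skipped
    }
}
assert get_sourced_components(config) == ["ner"]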
+ """ + return [ + name + for name, cfg in config.get("components", {}).items() + if "factory" not in cfg and "source" in cfg + ] + + +def convert_vectors( + nlp: "Language", + vectors_loc: Optional[Path], + *, + truncate: int, + prune: int, + name: Optional[str] = None, +) -> None: + vectors_loc = ensure_path(vectors_loc) + if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): + nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) + for lex in nlp.vocab: + if lex.rank and lex.rank != OOV_RANK: + nlp.vocab.vectors.add(lex.orth, row=lex.rank) + else: + if vectors_loc: + logger.info(f"Reading vectors from {vectors_loc}") + vectors_data, vector_keys = read_vectors(vectors_loc, truncate) + logger.info(f"Loaded vectors from {vectors_loc}") + else: + vectors_data, vector_keys = (None, None) + if vector_keys is not None: + for word in vector_keys: + if word not in nlp.vocab: + nlp.vocab[word] + if vectors_data is not None: + nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) + if name is None: + # TODO: Is this correct? Does this matter? + nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" + else: + nlp.vocab.vectors.name = name + nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name + if prune >= 1: + nlp.vocab.prune_vectors(prune) + + +def read_vectors(vectors_loc: Path, truncate_vectors: int): + f = open_file(vectors_loc) + f = ensure_shape(f) + shape = tuple(int(size) for size in next(f).split()) + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) + vectors_data = numpy.zeros(shape=shape, dtype="f") + vectors_keys = [] + for i, line in enumerate(tqdm.tqdm(f)): + line = line.rstrip() + pieces = line.rsplit(" ", vectors_data.shape[1]) + word = pieces.pop(0) + if len(pieces) != vectors_data.shape[1]: + raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)) + vectors_data[i] = numpy.asarray(pieces, dtype="f") + vectors_keys.append(word) + if i == truncate_vectors - 1: + break + return vectors_data, vectors_keys + + +def open_file(loc: Union[str, Path]) -> IO: + """Handle .gz, .tar.gz or unzipped files""" + loc = ensure_path(loc) + if tarfile.is_tarfile(str(loc)): + return tarfile.open(str(loc), "r:gz") + elif loc.parts[-1].endswith("gz"): + return (line.decode("utf8") for line in gzip.open(str(loc), "r")) + elif loc.parts[-1].endswith("zip"): + zip_file = zipfile.ZipFile(str(loc)) + names = zip_file.namelist() + file_ = zip_file.open(names[0]) + return (line.decode("utf8") for line in file_) + else: + return loc.open("r", encoding="utf8") + + +def ensure_shape(lines): + """Ensure that the first line of the data is the vectors shape. + If it's not, we read in the data and output the shape as the first result, + so that the reader doesn't have to deal with the problem. + """ + first_line = next(lines) + try: + shape = tuple(int(size) for size in first_line.split()) + except ValueError: + shape = None + if shape is not None: + # All good, give the data + yield first_line + yield from lines + else: + # Figure out the shape, make it the first value, and then give the + # rest of the data. 
+ width = len(first_line.split()) - 1 + captured = [first_line] + list(lines) + length = len(captured) + yield f"{length} {width}" + yield from captured diff --git a/spacy/training/loop.py b/spacy/training/loop.py new file mode 100644 index 000000000..e20cddd3e --- /dev/null +++ b/spacy/training/loop.py @@ -0,0 +1,304 @@ +from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any +from typing import Optional, TYPE_CHECKING +from pathlib import Path +from timeit import default_timer as timer +from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator +import random +import tqdm +from wasabi import Printer + +from .example import Example +from ..schemas import ConfigSchemaTraining +from ..errors import Errors +from ..util import resolve_dot_names, registry + +if TYPE_CHECKING: + from ..language import Language # noqa: F401 + + +def train( + nlp: "Language", + output_path: Optional[Path] = None, + *, + use_gpu: int = -1, + silent: bool = False, +) -> None: + """Train a pipeline. + + nlp (Language): The initialized nlp object with the full config. + output_path (Path): Optional output path to save trained model to. + use_gpu (int): Whether to train on GPU. Make sure to call require_gpu + before calling this function. + silent (bool): Whether to pretty-print outputs. + RETURNS (Path / None): The path to the final exported model. + """ + msg = Printer(no_print=silent) + # Create iterator, which yields out info after each optimization step. + config = nlp.config.interpolate() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(config, dot_names) + optimizer = T["optimizer"] + score_weights = T["score_weights"] + batcher = T["batcher"] + train_logger = T["logger"] + before_to_disk = create_before_to_disk_callback(T["before_to_disk"]) + # Components that shouldn't be updated during training + frozen_components = T["frozen_components"] + # Create iterator, which yields out info after each optimization step. 
+ training_step_iterator = train_while_improving( + nlp, + optimizer, + create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]), + create_evaluation_callback(nlp, dev_corpus, score_weights), + dropout=T["dropout"], + accumulate_gradient=T["accumulate_gradient"], + patience=T["patience"], + max_steps=T["max_steps"], + eval_frequency=T["eval_frequency"], + exclude=frozen_components, + ) + msg.info(f"Pipeline: {nlp.pipe_names}") + if frozen_components: + msg.info(f"Frozen components: {frozen_components}") + msg.info(f"Initial learn rate: {optimizer.learn_rate}") + with nlp.select_pipes(disable=frozen_components): + print_row, finalize_logger = train_logger(nlp) + try: + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch 1") + for batch, info, is_best_checkpoint in training_step_iterator: + progress.update(1) + if is_best_checkpoint is not None: + progress.close() + print_row(info) + if is_best_checkpoint and output_path is not None: + with nlp.select_pipes(disable=frozen_components): + update_meta(T, nlp, info) + with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-best") + progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) + progress.set_description(f"Epoch {info['epoch']}") + except Exception as e: + finalize_logger() + if output_path is not None: + # We don't want to swallow the traceback if we don't have a + # specific error. + msg.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}" + ) + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-final") + raise e + finally: + finalize_logger() + if output_path is not None: + final_model_path = output_path / "model-final" + if optimizer.averages: + with nlp.use_params(optimizer.averages): + nlp.to_disk(final_model_path) + else: + nlp.to_disk(final_model_path) + msg.good(f"Saved pipeline to output directory", final_model_path) + + +def train_while_improving( + nlp: "Language", + optimizer: Optimizer, + train_data, + evaluate, + *, + dropout: float, + eval_frequency: int, + accumulate_gradient: int, + patience: int, + max_steps: int, + exclude: List[str], +): + """Train until an evaluation stops improving. Works as a generator, + with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, + where info is a dict, and is_best_checkpoint is in [True, False, None] -- + None indicating that the iteration was not evaluated as a checkpoint. + The evaluation is conducted by calling the evaluate callback. + + Positional arguments: + nlp: The spaCy pipeline to evaluate. + optimizer: The optimizer callable. + train_data (Iterable[Batch]): A generator of batches, with the training + data. Each batch should be a Sized[Tuple[Input, Annot]]. The training + data iterable needs to take care of iterating over the epochs and + shuffling. + evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation. + The callback should take no arguments and return a tuple + `(main_score, other_scores)`. The main_score should be a float where + higher is better. other_scores can be any object. + + Every iteration, the function yields out a tuple with: + + * batch: A list of Example objects. + * info: A dict with various information about the last update (see below). + * is_best_checkpoint: A value in None, False, True, indicating whether this + was the best evaluation so far. You should use this to save the model + checkpoints during training. 
If None, evaluation was not conducted on + that iteration. False means evaluation was conducted, but a previous + evaluation was better. + + The info dict provides the following information: + + epoch (int): How many passes over the data have been completed. + step (int): How many steps have been completed. + score (float): The main score from the last evaluation. + other_scores: : The other scores from the last evaluation. + losses: The accumulated losses throughout training. + checkpoints: A list of previous results, where each result is a + (score, step, epoch) tuple. + """ + if isinstance(dropout, float): + dropouts = constant(dropout) + else: + dropouts = dropout + results = [] + losses = {} + words_seen = 0 + start_time = timer() + for step, (epoch, batch) in enumerate(train_data): + dropout = next(dropouts) + for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( + subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + ) + # TODO: refactor this so we don't have to run it separately in here + for name, proc in nlp.pipeline: + if ( + name not in exclude + and hasattr(proc, "model") + and proc.model not in (True, False, None) + ): + proc.model.finish_update(optimizer) + optimizer.step_schedules() + if not (step % eval_frequency): + if optimizer.averages: + with nlp.use_params(optimizer.averages): + score, other_scores = evaluate() + else: + score, other_scores = evaluate() + results.append((score, step)) + is_best_checkpoint = score == max(results)[0] + else: + score, other_scores = (None, None) + is_best_checkpoint = None + words_seen += sum(len(eg) for eg in batch) + info = { + "epoch": epoch, + "step": step, + "score": score, + "other_scores": other_scores, + "losses": losses, + "checkpoints": results, + "seconds": int(timer() - start_time), + "words": words_seen, + } + yield batch, info, is_best_checkpoint + if is_best_checkpoint is not None: + losses = {} + # Stop if no improvement in `patience` updates (if specified) + best_score, best_step = max(results) + if patience and (step - best_step) >= patience: + break + # Stop if we've exhausted our max steps (if specified) + if max_steps and step >= max_steps: + break + + +def subdivide_batch(batch, accumulate_gradient): + batch = list(batch) + batch.sort(key=lambda eg: len(eg.predicted)) + sub_len = len(batch) // accumulate_gradient + start = 0 + for i in range(accumulate_gradient): + subbatch = batch[start : start + sub_len] + if subbatch: + yield subbatch + start += len(subbatch) + subbatch = batch[start:] + if subbatch: + yield subbatch + + +def create_evaluation_callback( + nlp: "Language", dev_corpus: Callable, weights: Dict[str, float] +) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if value is not None} + + def evaluate() -> Tuple[float, Dict[str, float]]: + dev_examples = list(dev_corpus(nlp)) + scores = nlp.evaluate(dev_examples) + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. 
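# A small worked example of the weighting below, with illustrative score names
# and numbers; scores that are not referenced in score_weights simply do not
# contribute to the main score.
example_weights = {"dep_uas": 0.5, "dep_las": 0.3, "tag_acc": 0.2}
example_scores = {"dep_uas": 0.9, "dep_las": 0.8, "tag_acc": 1.0, "speed": 12000.0}
weighted = sum(example_scores.get(s, 0.0) * example_weights.get(s, 0.0) for s in example_weights)
assert abs(weighted - 0.89) < 1e-9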
+ for key, value in scores.items(): + if key in weights and not isinstance(value, (int, float)): + raise ValueError(Errors.E915.format(name=key, score_type=type(value))) + try: + weighted_score = sum( + scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights + ) + except KeyError as e: + keys = list(scores.keys()) + err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) + raise KeyError(err) from None + return weighted_score, scores + + return evaluate + + +def create_train_batches( + iterator: Iterator[Example], + batcher: Callable[[Iterable[Example]], Iterable[Example]], + max_epochs: int, +): + epoch = 0 + examples = list(iterator) + if not examples: + # Raise error if no data + raise ValueError(Errors.E986) + while max_epochs < 1 or epoch != max_epochs: + random.shuffle(examples) + for batch in batcher(examples): + yield epoch, batch + epoch += 1 + + +def update_meta( + training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any] +) -> None: + nlp.meta["performance"] = {} + for metric in training["score_weights"]: + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def create_before_to_disk_callback( + callback: Optional[Callable[["Language"], "Language"]] +) -> Callable[["Language"], "Language"]: + from ..language import Language # noqa: F811 + + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py new file mode 100644 index 000000000..4f05c6344 --- /dev/null +++ b/spacy/training/pretrain.py @@ -0,0 +1,266 @@ +from typing import Optional, Callable, Iterable, Union, List +from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer +from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance +from pathlib import Path +from functools import partial +from collections import Counter +import srsly +import numpy +import time +import re +from wasabi import Printer + +from .example import Example +from ..tokens import Doc +from ..attrs import ID +from ..ml.models.multi_task import build_cloze_multi_task_model +from ..ml.models.multi_task import build_cloze_characters_multi_task_model +from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..util import registry, load_model_from_config, dot_to_object + + +def pretrain( + config: Config, + output_dir: Path, + resume_path: Optional[Path] = None, + epoch_resume: Optional[int] = None, + use_gpu: int = -1, + silent: bool = True, +): + msg = Printer(no_print=silent) + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + nlp = load_model_from_config(config) + _config = nlp.config.interpolate() + T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) + P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) + corpus = dot_to_object(T, P["corpus"]) + batcher = P["batcher"] + model = create_pretraining_model(nlp, P) + optimizer = P["optimizer"] + # Load in pretrained weights to resume from + if 
resume_path is not None: + _resume_model(model, resume_path, epoch_resume, silent=silent) + else: + # Without '--resume-path' the '--epoch-resume' argument is ignored + epoch_resume = 0 + # TODO: move this to logger function? + tracker = ProgressTracker(frequency=10000) + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") + row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} + msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) + + def _save_model(epoch, is_temp=False): + is_temp_str = ".temp" if is_temp else "" + with model.use_params(optimizer.averages): + with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_: + file_.write(model.get_ref("tok2vec").to_bytes()) + log = { + "nr_word": tracker.nr_word, + "loss": tracker.loss, + "epoch_loss": tracker.epoch_loss, + "epoch": epoch, + } + with (output_dir / "log.jsonl").open("a") as file_: + file_.write(srsly.json_dumps(log) + "\n") + + objective = create_objective(P["objective"]) + # TODO: I think we probably want this to look more like the + # 'create_train_batches' function? + for epoch in range(epoch_resume, P["max_epochs"]): + for batch_id, batch in enumerate(batcher(corpus(nlp))): + docs = ensure_docs(batch) + loss = make_update(model, docs, optimizer, objective) + progress = tracker.update(epoch, loss, docs) + if progress: + msg.row(progress, **row_settings) + if P["n_save_every"] and (batch_id % P["n_save_every"] == 0): + _save_model(epoch, is_temp=True) + _save_model(epoch) + tracker.epoch_loss = 0.0 + + +def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]: + docs = [] + for eg_or_doc in examples_or_docs: + if isinstance(eg_or_doc, Doc): + docs.append(eg_or_doc) + else: + docs.append(eg_or_doc.reference) + return docs + + +def _resume_model( + model: Model, resume_path: Path, epoch_resume: int, silent: bool = True +) -> None: + msg = Printer(no_print=silent) + msg.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(resume_path)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + msg.info(f"Resuming from epoch: {epoch_resume}") + else: + msg.info(f"Resuming from epoch: {epoch_resume}") + + +def make_update( + model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable +) -> float: + """Perform an update over a single batch of documents. + + docs (iterable): A batch of `Doc` objects. + optimizer (callable): An optimizer. + RETURNS loss: A float for the loss. + """ + predictions, backprop = model.begin_update(docs) + loss, gradients = objective_func(model.ops, docs, predictions) + backprop(gradients) + model.finish_update(optimizer) + # Don't want to return a cupy object here + # The gradients are modified in-place by the BERT MLM, + # so we get an accurate loss + return float(loss) + + +def create_objective(config: Config): + """Create the objective for pretraining. + + We'd like to replace this with a registry function but it's tricky because + we're also making a model choice based on this. For now we hard-code support + for two types (characters, vectors). For characters you can specify + n_characters, for vectors you can specify the loss. + + Bleh. 
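# The two objective configurations this function accepts, written out as plain
# dicts (the concrete values are illustrative; the module path is assumed from
# this diff):
#
#     {"type": "characters", "n_characters": 4}
#     {"type": "vectors", "loss": "cosine"}   # or "L2"
#
from spacy.training.pretrain import create_objective

objective = create_objective({"type": "characters", "n_characters": 4})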
+ """ + objective_type = config["type"] + if objective_type == "characters": + return partial(get_characters_loss, nr_char=config["n_characters"]) + elif objective_type == "vectors": + if config["loss"] == "cosine": + distance = CosineDistance(normalize=True, ignore_zeros=True) + return partial(get_vectors_loss, distance=distance) + elif config["loss"] == "L2": + distance = L2Distance(normalize=True, ignore_zeros=True) + return partial(get_vectors_loss, distance=distance) + else: + raise ValueError("Unexpected loss type", config["loss"]) + else: + raise ValueError("Unexpected objective_type", objective_type) + + +def get_vectors_loss(ops, docs, prediction, distance): + """Compute a loss based on a distance between the documents' vectors and + the prediction. + """ + # The simplest way to implement this would be to vstack the + # token.vector values, but that's a bit inefficient, especially on GPU. + # Instead we fetch the index into the vectors table for each of our tokens, + # and look them up all at once. This prevents data copying. + ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) + target = docs[0].vocab.vectors.data[ids] + d_target, loss = distance(prediction, target) + return loss, d_target + + +def get_characters_loss(ops, docs, prediction, nr_char): + """Compute a loss based on a number of characters predicted from the docs.""" + target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs]) + target_ids = target_ids.reshape((-1,)) + target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") + target = target.reshape((-1, 256 * nr_char)) + diff = prediction - target + loss = (diff ** 2).sum() + d_target = diff / float(prediction.shape[0]) + return loss, d_target + + +def create_pretraining_model(nlp, pretrain_config): + """Define a network for the pretraining. We simply add an output layer onto + the tok2vec input model. The tok2vec input model needs to be a model that + takes a batch of Doc objects (as a list), and returns a list of arrays. + Each array in the output needs to have one row per token in the doc. + The actual tok2vec layer is stored as a reference, and only this bit will be + serialized to file and read back in when calling the 'train' command. 
+ """ + component = nlp.get_pipe(pretrain_config["component"]) + if pretrain_config.get("layer"): + tok2vec = component.model.get_ref(pretrain_config["layer"]) + else: + tok2vec = component.model + + # TODO + maxout_pieces = 3 + hidden_size = 300 + if pretrain_config["objective"]["type"] == "vectors": + model = build_cloze_multi_task_model( + nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces + ) + elif pretrain_config["objective"]["type"] == "characters": + model = build_cloze_characters_multi_task_model( + nlp.vocab, + tok2vec, + hidden_size=hidden_size, + maxout_pieces=maxout_pieces, + nr_char=pretrain_config["objective"]["n_characters"], + ) + model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + set_dropout_rate(model, pretrain_config["dropout"]) + return model + + +class ProgressTracker: + def __init__(self, frequency=1000000): + self.loss = 0.0 + self.prev_loss = 0.0 + self.nr_word = 0 + self.words_per_epoch = Counter() + self.frequency = frequency + self.last_time = time.time() + self.last_update = 0 + self.epoch_loss = 0.0 + + def update(self, epoch, loss, docs): + self.loss += loss + self.epoch_loss += loss + words_in_batch = sum(len(doc) for doc in docs) + self.words_per_epoch[epoch] += words_in_batch + self.nr_word += words_in_batch + words_since_update = self.nr_word - self.last_update + if words_since_update >= self.frequency: + wps = words_since_update / (time.time() - self.last_time) + self.last_update = self.nr_word + self.last_time = time.time() + loss_per_word = self.loss - self.prev_loss + status = ( + epoch, + self.nr_word, + _smart_round(self.loss, width=10), + _smart_round(loss_per_word, width=6), + int(wps), + ) + self.prev_loss = float(self.loss) + return status + else: + return None + + +def _smart_round( + figure: Union[float, int], width: int = 10, max_decimal: int = 4 +) -> str: + """Round large numbers as integers, smaller numbers as decimals.""" + n_digits = len(str(int(figure))) + n_decimal = width - (n_digits + 1) + if n_decimal <= 1: + return str(int(figure)) + else: + n_decimal = min(n_decimal, max_decimal) + format_str = "%." + str(n_decimal) + "f" + return format_str % figure diff --git a/spacy/util.py b/spacy/util.py index d919b161e..c43943ef7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,6 +8,7 @@ import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer +from thinc.api import ConfigValidationError import functools import itertools import numpy.random @@ -56,19 +57,24 @@ if TYPE_CHECKING: OOV_RANK = numpy.iinfo(numpy.uint64).max +DEFAULT_OOV_PROB = -20 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. 
# fmt: off -CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"] +CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"] # fmt: on -logging.basicConfig() +logging.basicConfig(format="%(message)s") logger = logging.getLogger("spacy") +class ENV_VARS: + CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" + + class registry(thinc.registry): languages = catalogue.create("spacy", "languages", entry_points=True) architectures = catalogue.create("spacy", "architectures", entry_points=True) @@ -240,28 +246,6 @@ def get_module_path(module: ModuleType) -> Path: return Path(sys.modules[module.__module__].__file__).parent -def load_vectors_into_model( - nlp: "Language", name: Union[str, Path], *, add_strings=True -) -> None: - """Load word vectors from an installed model or path into a model instance.""" - vectors_nlp = load_model(name) - nlp.vocab.vectors = vectors_nlp.vocab.vectors - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. - for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) - - -def load_vocab_data_into_model( - nlp: "Language", *, lookups: Optional["Lookups"] = None -) -> None: - """Load vocab data.""" - if lookups: - nlp.vocab.lookups = lookups - - def load_model( name: Union[str, Path], *, @@ -400,27 +384,39 @@ def load_model_from_config( return nlp -def resolve_training_config( - config: Config, - exclude: Iterable[str] = ("nlp", "components"), - validate: bool = True, -) -> Dict[str, Any]: - """Resolve the config sections relevant for trainig and create all objects. - Mostly used in the CLI to separate training config (not resolved by default - because not runtime-relevant – an nlp object should load fine even if it's - [training] block refers to functions that are not available etc.). +def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]: + """Resolve one or more "dot notation" names, e.g. corpora.train. + The paths could point anywhere into the config, so we don't know which + top-level section we'll be looking within. - config (Config): The config to resolve. - exclude (Iterable[str]): The config blocks to exclude. Those blocks won't - be available in the final resolved config. - validate (bool): Whether to validate the config. - RETURNS (Dict[str, Any]): The resolved config. + We resolve the whole top-level section, although we could resolve less -- + we could find the lowest part of the tree. """ - config = config.copy() - for key in exclude: - if key in config: - config.pop(key) - return registry.resolve(config, validate=validate) + # TODO: include schema? 
+ resolved = {} + output = [] + errors = [] + for name in dot_names: + if name is None: + output.append(name) + else: + section = name.split(".")[0] + # We want to avoid resolving the same thing twice + if section not in resolved: + if registry.is_promise(config[section]): + # Otherwise we can't resolve [corpus] if it's a promise + result = registry.resolve({"config": config[section]})["config"] + else: + result = registry.resolve(config[section]) + resolved[section] = result + try: + output.append(dot_to_object(resolved, name)) + except KeyError: + msg = f"not a valid section reference: {name}" + errors.append({"loc": name.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config=config, errors=errors) + return tuple(output) def load_model_from_init_py( @@ -492,7 +488,7 @@ def load_config_from_str( RETURNS (Config): The loaded config. """ return Config(section_order=CONFIG_SECTION_ORDER).from_str( - text, overrides=overrides, interpolate=interpolate, + text, overrides=overrides, interpolate=interpolate ) @@ -1300,3 +1296,23 @@ def minibatch(items, size): if len(batch) == 0: break yield list(batch) + + +def is_cython_func(func: Callable) -> bool: + """Slightly hacky check for whether a callable is implemented in Cython. + Can be used to implement slightly different behaviors, especially around + inspecting and parameter annotations. Note that this will only return True + for actual cdef functions and methods, not regular Python functions defined + in Python modules. + + func (Callable): The callable to check. + RETURNS (bool): Whether the callable is Cython (probably). + """ + attr = "__pyx_vtable__" + if hasattr(func, attr): # function or class instance + return True + # https://stackoverflow.com/a/55767059 + if hasattr(func, "__qualname__") and hasattr(func, "__module__"): # method + cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] + return hasattr(cls_func, attr) + return False diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index ef2666ec0..5cee45ba5 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through -a feed-forward subnetwork to build a mixed representations. The features used -are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying -definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from -pretrained static vectors can also be incorporated into the concatenated -representation. +a feed-forward subnetwork to build mixed representations. The features used are +the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a +[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static +vectors can also be incorporated into the concatenated representation. | Name | Description | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -170,7 +169,7 @@ representation. > nC = 8 > ``` -Construct an embedded representations based on character embeddings, using a +Construct an embedded representation based on character embeddings, using a feed-forward network. 
A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is used in the center for words that are too short. @@ -292,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details. | `key_attr` | Defaults to `"ORTH"`. ~~str~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | +### spacy.FeatureExtractor.v1 {#FeatureExtractor} + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.FeatureExtractor.v1" +> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] +> ``` + +Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list +of feature names to extract, which should refer to token attributes. + +| Name |  Description | +| ----------- | ------------------------------------------------------------------------ | +| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | +| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ | + ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} The following architectures are provided by the package @@ -392,7 +409,7 @@ a single token vector given zero or more wordpiece vectors. > ``` Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does -**not** allow multiple components to share the transformer weights, and does +**not** allow multiple components to share the transformer weights and does **not** allow the transformer to set annotations into the [`Doc`](/api/doc) object, but it's a **simpler solution** if you only need the transformer within one component. @@ -437,7 +454,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python) helpful for background information. The neural network state prediction model consists of either two or three subnetworks: -- **tok2vec**: Map each token into a vector representations. This subnetwork is +- **tok2vec**: Map each token into a vector representation. This subnetwork is run once for each batch. - **lower**: Construct a feature-specific vector for each `(token, feature)` pair. This is also run once for each batch. Constructing the state @@ -517,18 +534,18 @@ specific data and challenge. Stacked ensemble of a bag-of-words model and a neural network model. The neural network has an internal CNN Tok2Vec layer and uses attention. -| Name | Description | -| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | -| `width` | Output dimension of the feature encoding step. ~~int~~ | -| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | -| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | -| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `dropout` | The dropout rate. 
~~float~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | +| `width` | Output dimension of the feature encoding step. ~~int~~ | +| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | +| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | +| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `dropout` | The dropout rate. ~~float~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatCNN.v1 {#TextCatCNN} @@ -555,12 +572,12 @@ A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. -| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatBOW.v1 {#TextCatBOW} @@ -575,16 +592,16 @@ architecture is usually less accurate than the ensemble, but runs faster. > nO = null > ``` -An ngram "bag-of-words" model. 
This architecture should run much faster than the -others, but may not be as accurate, especially if texts are short. +An n-gram "bag-of-words" model. This architecture should run much faster than +the others, but may not be as accurate, especially if texts are short. -| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} @@ -596,7 +613,7 @@ into the "real world". This requires 3 main components: synonyms and prior probabilities. - A candidate generation step to produce a set of likely identifiers, given a certain textual mention. -- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the +- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the most plausible ID from the set of candidates. ### spacy.EntityLinker.v1 {#EntityLinker} @@ -629,11 +646,11 @@ into the "real world". This requires 3 main components: The `EntityLinker` model architecture is a Thinc `Model` with a [`Linear`](https://thinc.ai/api-layers#linear) output layer. -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. 
~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.EmptyKB.v1 {#EmptyKB} diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 53c8c46cf..60fda6bda 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -71,7 +71,7 @@ pattern_dicts = [ ## AttributeRuler.\_\_call\_\_ {#call tag="method"} -Apply the attribute ruler to a Doc, setting token attributes for tokens matched +Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched by the provided patterns. | Name | Description | @@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ---------- | -------------------------------------------------------------- | | `vocab` | The shared [`Vocab`](/api/vocab). | -| `patterns` | The Matcher patterns. You usually don't want to exclude this. | +| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | | `attrs` | The attributes to set. You usually don't want to exclude this. | | `indices` | The token indices. You usually don't want to exclude this. | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ade62e3db..32d73d762 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -81,9 +81,9 @@ $ python -m spacy info [model] [--markdown] [--silent] Find all trained pipeline packages installed in the current environment and check whether they are compatible with the currently installed version of spaCy. Should be run after upgrading spaCy via `pip install -U spacy` to ensure that -all installed packages are can be used with the new version. It will show a list -of packages and their installed versions. If any package is out of date, the -latest compatible versions and command for updating are shown. +all installed packages can be used with the new version. It will show a list of +packages and their installed versions. If any package is out of date, the latest +compatible versions and command for updating are shown. > #### Automated validation > @@ -170,38 +170,69 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Complete and auto-filled config file for training. | -### init vocab {#init-vocab new="3" tag="command"} +### init vectors {#init-vectors new="3" tag="command"} -Create a blank pipeline directory from raw data, like word frequencies, Brown -clusters and word vectors. Note that in order to populate the vocabulary, you -need to pass in a JSONL-formatted -[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional -`id` values that correspond to the vectors table. Just loading in vectors will -not automatically populate the vocab. +Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use +with spaCy. 
Will export an `nlp` object that you can use in the +[`[initialize]`](/api/data-formats#config-initialize) block of your config to +initialize a model with vectors. See the usage guide on +[static vectors](/usage/embeddings-transformers#static-vectors) for details on +how to use vectors in your model. -This command was previously called `init-model`. +This functionality was previously available as part of the command `init-model`. ```cli -$ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base] +$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] ``` -| Name | Description | -| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | -| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | -| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ | -| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ | -| `--truncate-vectors`, `-t` 2.3 | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | -| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | -| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | -| `--meta-name`, `-mn` | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~ | -| `--base`, `-b` | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | +| `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | +| `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | +| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. 
Defaults to `0` for no truncation. ~~int (option)~~ | +| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | +| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | +| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | + +### init labels {#init-labels new="3" tag="command"} + +Generate JSON files for the labels in the data. This helps speed up the training +process, since spaCy won't have to preprocess the data to extract the labels. +After generating the labels, you can provide them to components that accept a +`labels` argument on initialization via the +[`[initialize]`](/api/data-formats#config-initialize) block of your config. + +> #### Example config +> +> ```ini +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json +> ``` + +```cli +$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides] +``` + +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **CREATES** | The final trained pipeline and the best trained pipeline. | ## convert {#convert tag="command"} @@ -408,7 +439,7 @@ File /path/to/thinc/thinc/schedules.py (line 91) ### debug data {#debug-data tag="command"} -Analyze, debug, and validate your training and development data. Get useful +Analyze, debug and validate your training and development data. Get useful stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
@@ -436,6 +467,7 @@ $ python -m spacy debug data [config_path] [--code] [--ignore-warnings] [--verbo ``` =========================== Data format validation =========================== ✔ Corpus is loadable +✔ Pipeline can be initialized with data =============================== Training stats =============================== Training pipeline: tagger, parser, ner @@ -465,7 +497,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== -ℹ 49 labels in data (57 labels in tag map) +ℹ 49 labels in data 'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830), 'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB' (74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN' @@ -476,7 +508,6 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' '-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW' (794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX' (24) -✔ All labels present in tag map for language 'en' ============================= Dependency Parsing ============================= ℹ Found 111703 sentences with an average length of 18.6 words. @@ -826,17 +857,18 @@ skew. To render a sample of dependency parses in a HTML file using the $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Name | Description | -| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | -| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | -| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Training results and optional metrics and visualizations. | +| Name | Description | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--code-path`, `-c` | Path to Python file with additional code to be imported. 
Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | ## package {#package tag="command"} diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index e7d6773e6..986c6f458 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -7,9 +7,11 @@ new: 3 --- This class manages annotated corpora and can be used for training and -development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To +development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To customize the data loading during training, you can register your own -[data readers and batchers](/usage/training#custom-code-readers-batchers). +[data readers and batchers](/usage/training#custom-code-readers-batchers). Also +see the usage guide on [data utilities](/usage/training#data) for more details +and examples. ## Config and implementation {#config} @@ -32,14 +34,16 @@ streaming. > gold_preproc = false > max_length = 0 > limit = 0 +> augmenter = null > ``` -| Name | Description | -| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ | -|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | -| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | -| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ | +|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. 
This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/training/corpus.py @@ -74,7 +78,7 @@ train/test skew. |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ +| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} @@ -96,7 +100,7 @@ Yield examples from the data. | `nlp` | The current `nlp` object. ~~Language~~ | | **YIELDS** | The examples. ~~Example~~ | -## JsonlTexts {#jsonltexts tag="class"} +## JsonlCorpus {#jsonlcorpus tag="class"} Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON) formatted raw text files. Can be used to read the raw text corpus for language @@ -122,22 +126,22 @@ file. {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."} ``` -### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"} +### JsonlCorpus.\_\init\_\_ {#jsonlcorpus tag="method"} Initialize the reader. > #### Example > > ```python -> from spacy.training import JsonlTexts +> from spacy.training import JsonlCorpus > -> corpus = JsonlTexts("./data/texts.jsonl") +> corpus = JsonlCorpus("./data/texts.jsonl") > ``` > > ```ini > ### Example config > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = "corpus/raw_text.jsonl" > min_length = 0 > max_length = 0 @@ -152,17 +156,17 @@ Initialize the reader. | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | -### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"} +### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"} Yield examples from the data. > #### Example > > ```python -> from spacy.training import JsonlTexts +> from spacy.training import JsonlCorpus > import spacy > -> corpus = JsonlTexts("./texts.jsonl") +> corpus = JsonlCorpus("./texts.jsonl") > nlp = spacy.blank("en") > data = corpus(nlp) > ``` diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6ff3bfd0d..c1b9bfef4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy > path = ${paths:dev} > > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = ${paths.raw} > > [corpora.my_custom_data] @@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. 
Each function takes an `nlp` object and yields [`Example`](/api/example) objects. By default, the two keys `train` and `dev` are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain` -section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader). +section is added that defaults to a [`JsonlCorpus`](/api/top-level#JsonlCorpus). You can also register custom functions that return a callable. | Name | Description | @@ -190,8 +190,6 @@ process that are used when you run [`spacy train`](/api/cli#train). | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | | `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | @@ -200,7 +198,6 @@ process that are used when you run [`spacy train`](/api/cli#train). | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | | `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -220,6 +217,36 @@ used when you run [`spacy pretrain`](/api/cli#pretrain). | `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | | `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +### initialize {#config-initialize tag="section"} + +This config block lets you define resources for **initializing the pipeline**. +It's used by [`Language.initialize`](/api/language#initialize) and typically +called right before training (but not at runtime). The section allows you to +specify local file paths or custom functions to load data resources from, +without requiring them at runtime when you load the trained pipeline back in. 
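As a rough sketch of how this looks from the component side, a custom component can declare extra arguments on its `initialize` method and have them filled in from `[initialize.components.<component_name>]`. The component name, the `data_path` argument and the JSON file below are purely illustrative (they mirror the hypothetical `my_component` entry in the example config that follows) and are not part of spaCy itself:

```python
from typing import Callable, Iterable, Optional
import srsly
from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example


class MyComponent:
    def __init__(self, name: str):
        self.name = name
        self.data = {}

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        data_path: Optional[str] = None,
    ) -> None:
        # Called by Language.initialize before training. `data_path` is filled
        # in from [initialize.components.my_component] and isn't needed when
        # the trained pipeline is loaded back in at runtime.
        if data_path is not None:
            self.data = srsly.read_json(data_path)

    def __call__(self, doc: Doc) -> Doc:
        # Runtime processing would use self.data here
        return doc


@Language.factory("my_component")
def create_my_component(nlp: Language, name: str) -> MyComponent:
    return MyComponent(name)
```

The matching config entry is shown in the example below.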
+ +> #### Example +> +> ```ini +> [initialize] +> vectors = "/path/to/vectors_nlp" +> init_tok2vec = "/path/to/pretrain.bin" +> +> [initialize_components] +> +> [initialize.components.my_component] +> data_path = "/path/to/component_data" +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | +| `tokenizer` | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~ | +| `vocab_data` | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~ | + ## Training data {#training} ### Binary training format {#binary-training new="3"} @@ -245,8 +272,8 @@ Typically, the extension for these binary files is `.spacy`, and they are used as input format for specifying a [training corpus](/api/corpus) and for spaCy's CLI [`train`](/api/cli#train) command. The built-in [`convert`](/api/cli#convert) command helps you convert spaCy's previous -[JSON format](#json-input) to the new binary format format. It also supports -conversion of the `.conllu` format used by the +[JSON format](#json-input) to the new binary format. It also supports conversion +of the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies). 
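As a minimal sketch of how a `.spacy` file can be produced directly from Python (the text, annotations and output path here are made up for illustration), the binary format is a serialized [`DocBin`](/api/docbin):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is looking at buying a U.K. startup")
# Attach whatever gold-standard annotations are available
doc.ents = [doc.char_span(0, 5, label="ORG")]
db = DocBin(docs=[doc])  # more docs can be added later with db.add(doc)
db.to_disk("./train.spacy")  # e.g. passed to training via --paths.train ./train.spacy
```

For corpora in other formats, the `convert` command described above produces the same kind of file.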
### JSON training format {#json-input tag="deprecated"} @@ -309,7 +336,7 @@ $ python -m spacy convert ./data.json ./output.spacy -Here's an example of dependencies, part-of-speech tags and names entities, taken +Here's an example of dependencies, part-of-speech tags and named entities, taken from the English Wall Street Journal portion of the Penn Treebank: ```json @@ -425,15 +452,20 @@ example = Example.from_dict(doc, gold_dict) ## Lexical data for vocabulary {#vocab-jsonl new="2"} -To populate a pipeline's vocabulary, you can use the -[`spacy init vocab`](/api/cli#init-vocab) command and load in a -[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one -lexical entry per line via the `--jsonl-loc` option. The first line defines the -language and vocabulary settings. All other lines are expected to be JSON -objects describing an individual lexeme. The lexical attributes will be then set -as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab` -command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the -lexical data. +This data file can be provided via the `vocab_data` setting in the +`[initialize]` block of the training config to pre-define the lexical data to +initialize the `nlp` object's vocabulary with. The file should contain one +lexical entry per line. The first line defines the language and vocabulary +settings. All other lines are expected to be JSON objects describing an +individual lexeme. The lexical attributes will be then set as attributes on +spaCy's [`Lexeme`](/api/lexeme#attributes) object. + +> #### Example config +> +> ```ini +> [initialize] +> vocab_data = "/path/to/vocab-data.jsonl" +> ``` ```python ### First line diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 8af4455d3..ea4b779c7 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -21,9 +21,9 @@ non-projective parses. The parser is trained using an **imitation learning objective**. It follows the actions predicted by the current weights, and at each state, determines which actions are compatible with the optimal parse that could be reached from the -current state. The weights such that the scores assigned to the set of optimal -actions is increased, while scores assigned to other actions are decreased. Note -that more than one action may be optimal for a given state. +current state. The weights are updated such that the scores assigned to the set +of optimal actions is increased, while scores assigned to other actions are +decreased. Note that more than one action may be optimal for a given state. ## Config and implementation {#config} @@ -140,31 +140,48 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## DependencyParser.begin_training {#begin_training tag="method"} +## DependencyParser.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. 
`get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + + + +This method was previously called `begin_training`. + + > #### Example > > ```python > parser = nlp.add_pipe("parser") -> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline) +> parser.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.parser] +> +> [initialize.components.parser.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/parser.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## DependencyParser.predict {#predict tag="method"} @@ -210,7 +227,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and > > ```python > parser = nlp.add_pipe("parser") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = parser.update(examples, sgd=optimizer) > ``` @@ -294,11 +311,10 @@ context, the original parameters are restored. ## DependencyParser.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. 
+provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index e10d9d077..d511dc889 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | -| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | -| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | -| `ents` 3 | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. 
~~Dict~~ | +| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | +| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ | +| `ents` 3 | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -503,8 +503,9 @@ invalidated, although they may accidentally continue to work. Mark a span for merging. The `attrs` will be applied to the resulting token (if they're context-dependent token attributes like `LEMMA` or `DEP`) or to the underlying lexeme (if they're context-independent lexical attributes like -`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a -dictionary mapping attribute names to values as the `"_"` key. +`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided +using the `"_"` key and specifying a dictionary that maps attribute names to +values. > #### Example > diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 03aff2f6e..3625ed790 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -47,7 +47,7 @@ Create a `DocBin` object to hold serialized annotations. | Argument | Description | | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ | -| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ | +| `store_user_data` | Whether to write the `Doc.user_data` and the values of custom extension attributes to file/bytes. Defaults to `False`. ~~bool~~ | | `docs` | `Doc` objects to add on initialization.
~~Iterable[Doc]~~ | ## DocBin.\_\len\_\_ {#len tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 945a1568a..169a175e2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -94,7 +94,7 @@ providing custom registered functions. ## EntityLinker.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. +Apply the pipe to one document. The document is modified in place and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe) @@ -139,31 +139,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityLinker.begin_training {#begin_training tag="method"} +## EntityLinker.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). + + + +This method was previously called `begin_training`. + + > #### Example > > ```python -> entity_linker = nlp.add_pipe("entity_linker", last=True) -> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## EntityLinker.predict {#predict tag="method"} @@ -211,7 +215,7 @@ pipe's entity linking model and context encoder. 
Delegates to > > ```python > entity_linker = nlp.add_pipe("entity_linker") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = entity_linker.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 6d710f425..5fbd0b229 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | @@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. +Apply the pipe to one document. The document is modified in place and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and @@ -129,31 +129,48 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityRecognizer.begin_training {#begin_training tag="method"} +## EntityRecognizer.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. 
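For readers who want to see the `get_examples` callback used with real annotations rather than an empty list, a minimal sketch is below. The sentence, the character offsets and the `ORG` label are toy values chosen for illustration, not part of the documented example:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
# One gold-standard Example built from character offsets (toy data)
doc = nlp.make_doc("Apple is looking at buying U.K. startup")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
# The callback returns an iterable of Example objects; the "ORG" label
# is picked up from this sample while the model is being initialized
ner.initialize(lambda: [example], nlp=nlp)
```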
+ + + +This method was previously called `begin_training`. + + > #### Example > > ```python > ner = nlp.add_pipe("ner") -> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline) +> ner.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json" > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## EntityRecognizer.predict {#predict tag="method"} @@ -199,7 +216,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and > > ```python > ner = nlp.add_pipe("ner") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = ner.update(examples, sgd=optimizer) > ``` @@ -282,11 +299,10 @@ context, the original parameters are restored. ## EntityRecognizer.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 7be44bc95..7b7e5b635 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
| Name | Description | | ----------------- | --------------------------------------------------------------------------------------------------------------------- | | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ | -| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ | +| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ | | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ | | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ | diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 668c8028f..2811f4d91 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -33,8 +33,8 @@ both documents. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | -| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ | -| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ | | _keyword-only_ | | | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ | @@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input). | Name | Description | | -------------- | ------------------------------------------------------------------------- | -| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ | -| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ | +| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ | +| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ | | **RETURNS** | The newly constructed object. ~~Example~~ | ## Example.text {#text tag="property"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index dd3cc57dd..6257199c9 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -8,8 +8,8 @@ source: spacy/language.py Usually you'll load this once per process as `nlp` and pass the instance around your application. The `Language` class is created when you call [`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and -[language data](/usage/adding-languages), optional binary weights, e.g. provided -by a [trained pipeline](/models), and the +[language data](/usage/linguistic-features#language-data), optional binary +weights, e.g. provided by a [trained pipeline](/models), and the [processing pipeline](/usage/processing-pipelines) containing components like the tagger or parser that are called on a document in order. You can also add your own processing pipeline components that take a `Doc` object, modify it and @@ -46,10 +46,11 @@ information in [`Language.meta`](/api/language#meta) and not to configure the ## Language.from_config {#from_config tag="classmethod" new="3"} Create a `Language` object from a loaded config. 
Will set up the tokenizer and -language data, add pipeline components based on the pipeline and components -define in the config and validate the results. If no config is provided, the -default config of the given language is used. This is also how spaCy loads a -model under the hood based on its [`config.cfg`](/api/data-formats#config). +language data, add pipeline components based on the pipeline and add pipeline +components based on the definitions specified in the config. If no config is +provided, the default config of the given language is used. This is also how +spaCy loads a model under the hood based on its +[`config.cfg`](/api/data-formats#config). > #### Example > @@ -107,7 +108,7 @@ decorator. For more details and examples, see the | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ | ## Language.factory {#factory tag="classmethod"} @@ -154,7 +155,7 @@ examples, see the | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -201,30 +202,40 @@ more efficient than processing texts one-by-one. | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | -## Language.begin_training {#begin_training tag="method"} +## Language.initialize {#initialize tag="method" new="3"} Initialize the pipeline for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples can either be the full training data or a representative sample. They -are used to **initialize the models** of trainable pipeline components and are -passed each component's [`begin_training`](/api/pipe#begin_training) method, if -available. Initialization includes validating the network, +[`Optimizer`](https://thinc.ai/docs/api-optimizers). 
Under the hood, it uses the +settings defined in the [`[initialize]`](/api/data-formats#config-initialize) +config block to set up the vocabulary, load in vectors and tok2vec weights and +pass optional arguments to the `initialize` methods implemented by pipeline +components or the tokenizer. This method is typically called automatically when +you run [`spacy train`](/api/cli#train). See the usage guide on the +[config lifecycle](/usage/training#config-lifecycle) and +[initialization](/usage/training#initialization) for details. + +`get_examples` should be a function that returns an iterable of +[`Example`](/api/example) objects. The data examples can either be the full +training data or a representative sample. They are used to **initialize the +models** of trainable pipeline components and are passed each component's +[`initialize`](/api/pipe#initialize) method, if available. Initialization +includes validating the network, [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference) and setting up the label scheme based on the data. -If no `get_examples` function is provided when calling `nlp.begin_training`, the +If no `get_examples` function is provided when calling `nlp.initialize`, the pipeline components will be initialized with generic data. In this case, it is crucial that the output dimension of each component has already been defined either in the [config](/usage/training#config), or by calling [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for the tagger or textcat). - + -The `Language.update` method now takes a **function** that is called with no -arguments and returns a sequence of [`Example`](/api/example) objects instead of -tuples of `Doc` and `GoldParse` objects. +This method was previously called `begin_training`. It now also takes a +**function** that is called with no arguments and returns a sequence of +[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` +objects. @@ -232,7 +243,7 @@ tuples of `Doc` and `GoldParse` objects. > > ```python > get_examples = lambda: examples -> optimizer = nlp.begin_training(get_examples) +> optimizer = nlp.initialize(get_examples) > ``` | Name | Description | @@ -601,7 +612,7 @@ does nothing. ## Language.enable_pipe {#enable_pipe tag="method" new="3"} -Enable a previously disable component (e.g. via +Enable a previously disabled component (e.g. via [`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is already enabled, this method does nothing. @@ -628,7 +639,7 @@ pipeline will be restored to the initial state at the end of the block. Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method you can use to undo your changes. You can specify either `disable` (as a list or string), or `enable`. In the latter case, all components not in the `enable` -list, will be disabled. Under the hood, this method calls into +list will be disabled. Under the hood, this method calls into [`disable_pipe`](/api/language#disable_pipe) and [`enable_pipe`](/api/language#enable_pipe). @@ -636,13 +647,13 @@ list, will be disabled. 
Under the hood, this method calls into > > ```python > with nlp.select_pipes(disable=["tagger", "parser"]): -> nlp.begin_training() +> nlp.initialize() > > with nlp.select_pipes(enable="ner"): -> nlp.begin_training() +> nlp.initialize() > > disabled = nlp.select_pipes(disable=["tagger", "parser"]) -> nlp.begin_training() +> nlp.initialize() > disabled.restore() > ``` @@ -661,7 +672,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: | -------------- | ------------------------------------------------------------------------------------------------------ | | _keyword-only_ | | | `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ | -| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | +| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -873,10 +884,10 @@ Loads state from a directory, including all data that was saved with the -Keep in mind that this method **only loads serialized state** and doesn't set up -the `nlp` object. This means that it requires the correct language class to be -initialized and all pipeline components to be added to the pipeline. If you want -to load a serialized pipeline from a directory, you should use +Keep in mind that this method **only loads the serialized state** and doesn't +set up the `nlp` object. This means that it requires the correct language class +to be initialized and all pipeline components to be added to the pipeline. If +you want to load a serialized pipeline from a directory, you should use [`spacy.load`](/api/top-level#spacy.load), which will set everything up for you. @@ -919,7 +930,7 @@ Serialize the current state to a binary string. Load state from a binary string. Note that this method is commonly used via the subclasses like `English` or `German` to make language-specific functionality -like the [lexical attribute getters](/usage/adding-languages#lex-attrs) +like the [lexical attribute getters](/usage/linguistic-features#language-data) available to the loaded object. > #### Example diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f9978dcf9..27ea04432 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes how the component should be configured. You can override its settings via the `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your [`config.cfg` for training](/usage/training#config). For examples of the lookups -data formats used by the lookup and rule-based lemmatizers, see +data format used by the lookup and rule-based lemmatizers, see [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). 
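To make the shape of that lookups data easier to picture, here is a small sketch that assembles a `Lookups` object in code. The table names follow the convention mentioned above; the entries themselves are invented placeholders rather than real lookup tables:

```python
from spacy.lookups import Lookups

lookups = Lookups()
# "lemma_lookup" maps surface forms directly to lemmas (lookup mode)
lookups.add_table("lemma_lookup", {"was": "be", "mice": "mouse"})
# "lemma_exc" stores per-POS exceptions for the rule-based mode
lookups.add_table("lemma_exc", {"verb": {"saw": ["see"]}})
# "lemma_rules" holds per-POS suffix rewrite rules
lookups.add_table("lemma_rules", {"noun": [["ies", "y"], ["s", ""]]})
```

A `Lookups` object assembled like this can then be passed to `lemmatizer.initialize(lookups=lookups)`, as shown in the `initialize` section further down.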
> #### Example @@ -48,12 +48,11 @@ data formats used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | -| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| Setting | Description | +| ----------- | --------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | ```python %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py @@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | **Not yet implemented:** The model to use. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | -| lookups | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ | -| overwrite | Whether to overwrite existing lemmas. ~~bool~ | +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | **Not yet implemented:** The model to use. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | +| overwrite | Whether to overwrite existing lemmas. ~~bool~~ |
This method is typically +called by [`Language.initialize`](/api/language#initialize) and lets you +customize arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. The loading only happens during initialization, typically before +training. At runtime, all data is loaded from disk. + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.initialize(lookups=lookups) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.lemmatizer] +> +> [initialize.components.lemmatizer.lookups] +> @misc = "load_my_lookups.v1" +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ | + ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"} Lemmatize a token using a lookup-based approach. If no lemma is found, the -original string is returned. Languages can provide a -[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. +original string is returned. | Name | Description | | ----------- | --------------------------------------------------- | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 3f7076a1c..81c2a8515 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -61,7 +61,7 @@ matched: | `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `+` | Require the pattern to match 1 or more times. | -| `*` | Allow the pattern to match zero or more times. | +| `*` | Allow the pattern to match 0 or more times. | Token patterns can also map to a **dictionary of properties** instead of a single value to indicate whether the expected value is a member of a list or how diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index e1a166474..50e2bb33a 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -117,32 +117,42 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Morphologizer.begin_training {#begin_training tag="method"} +## Morphologizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. 
Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. > #### Example > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> nlp.pipeline.append(morphologizer) -> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline) +> morphologizer.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.morphologizer] +> +> [initialize.components.morphologizer.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/morphologizer.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## Morphologizer.predict {#predict tag="method"} @@ -189,7 +199,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = morphologizer.update(examples, sgd=optimizer) > ``` @@ -259,12 +269,11 @@ context, the original parameters are restored. Add a new label to the pipe. If the `Morphologizer` should set annotations for both `pos` and `morph`, the label should include the UPOS as the feature `POS`. Raises an error if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). 
Note that you don't have to -call this method if you provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +already been fully [initialized](#initialize). Note that you don't have to call +this method if you provide a **representative data sample** to the +[`initialize`](#initialize) method. In this case, all labels found in the sample +will be automatically added to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index 5d5324061..e64f26bdd 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -12,7 +12,7 @@ container storing a single morphological analysis. ## Morphology.\_\_init\_\_ {#init tag="method"} -Create a Morphology object. +Create a `Morphology` object. > #### Example > @@ -101,7 +101,7 @@ representation. | Name | Description | | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | -| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | +| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | ## Attributes {#attributes} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index e4e1e97f1..4f5ac6f61 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -98,31 +98,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Pipe.begin_training {#begin_training tag="method"} +## Pipe.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). + + + +This method was previously called `begin_training`. 
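As a hedged sketch of what a data-backed `get_examples` callback might look like for a custom component: the factory name `"your_custom_pipe"` is the placeholder used in the surrounding examples and would need to be registered for this to run, and the `./train.spacy` path is an assumption, not taken from the original docs:

```python
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

nlp = spacy.blank("en")
pipe = nlp.add_pipe("your_custom_pipe")  # assumes a registered factory

def get_examples():
    # Pair each reference doc from a DocBin file with a blank predicted
    # doc, as expected by the Example constructor
    doc_bin = DocBin().from_disk("./train.spacy")
    return [
        Example(nlp.make_doc(doc.text), doc)
        for doc in doc_bin.get_docs(nlp.vocab)
    ]

pipe.initialize(get_examples, nlp=nlp)
```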
+ + > #### Example > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline) +> pipe.initialize(lambda: [], nlp=nlp) +> ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Pipe.predict {#predict tag="method"} @@ -180,7 +184,7 @@ predictions and gold-standard annotations, and update the component's model. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = pipe.update(examples, sgd=optimizer) > ``` @@ -296,9 +300,9 @@ context, the original parameters are restored. Add a new label to the pipe, to be predicted by the model. The actual implementation depends on the specific component, but in general `add_label` shouldn't be called if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). If these conditions are -violated, the function will raise an Error. The exception to this rule is when -the component is [resizable](#is_resizable), in which case +already been fully [initialized](#initialize). If these conditions are violated, +the function will raise an Error. The exception to this rule is when the +component is [resizable](#is_resizable), in which case [`set_output`](#set_output) should be called to ensure that the model is properly resized. @@ -314,9 +318,9 @@ This method needs to be overwritten with your own custom `add_label` method. | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ | Note that in general, you don't have to call `pipe.add_label` if you provide a -representative data sample to the [`begin_training`](#begin_training) method. In -this case, all labels found in the sample will be automatically added to the -model, and the output dimension will be +representative data sample to the [`initialize`](#initialize) method. In this +case, all labels found in the sample will be automatically added to the model, +and the output dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) automatically. ## Pipe.is_resizable {#is_resizable tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index acf94fb8e..fced37fd3 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -114,31 +114,29 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## SentenceRecognizer.begin_training {#begin_training tag="method"} +## SentenceRecognizer.initialize {#initialize tag="method"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers).
`get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > senter = nlp.add_pipe("senter") -> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline) +> senter.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## SentenceRecognizer.predict {#predict tag="method"} @@ -185,7 +183,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = senter.update(examples, sgd=optimizer) > ``` @@ -202,7 +200,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the -current model to make predictions similar to an initial model, to try to address +current model to make predictions similar to an initial model to try to address the "catastrophic forgetting" problem. This feature is experimental. > #### Example diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index ae31e4ddf..594a85f74 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -8,7 +8,7 @@ api_string_name: sentencizer api_trainable: false --- -A simple pipeline component, to allow custom sentence boundary detection logic +A simple pipeline component to allow custom sentence boundary detection logic that doesn't require the dependency parse. By default, sentence segmentation is performed by the [`DependencyParser`](/api/dependencyparser), so the `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't @@ -130,7 +130,7 @@ Score a batch of examples. ## Sentencizer.to_disk {#to_disk tag="method"} -Save the sentencizer settings (punctuation characters) a directory. Will create +Save the sentencizer settings (punctuation characters) to a directory. Will create a file `sentencizer.json`. 
This also happens automatically when you save an `nlp` object with a sentencizer added to its pipeline. diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 1c7bc9592..7fa1aaa38 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object. ## Span.\_\_init\_\_ {#init tag="method"} -Create a Span object from the slice `doc[start : end]`. +Create a `Span` object from the slice `doc[start : end]`. > #### Example > @@ -187,7 +187,7 @@ the character indices don't map to a valid span. | Name | Description | | ------------------------------------ | ----------------------------------------------------------------------------------------- | | `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d428d376e..d7c56be67 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -112,31 +112,48 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tagger.begin_training {#begin_training tag="method"} +## Tagger.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + + + +This method was previously called `begin_training`. 
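To illustrate the label inference described above, the following sketch passes a single toy `Example` to the tagger instead of an empty list; the sentence and its gold tags are made up for the example:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
# A single toy Example; a representative sample would normally be larger
doc = nlp.make_doc("I like green trees")
example = Example.from_dict(doc, {"tags": ["PRON", "VERB", "ADJ", "NOUN"]})
tagger.initialize(lambda: [example], nlp=nlp)
print(tagger.labels)  # tag labels inferred from the sample
```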
+ + > #### Example > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline) +> tagger.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.tagger] +> +> [initialize.components.tagger.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/tagger.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ | ## Tagger.predict {#predict tag="method"} @@ -183,7 +200,7 @@ Delegates to [`predict`](/api/tagger#predict) and > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tagger.update(examples, sgd=optimizer) > ``` @@ -289,12 +306,12 @@ context, the original parameters are restored. ## Tagger.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index b68039094..dd8c81040 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -125,35 +125,52 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/textcategorizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## TextCategorizer.begin_training {#begin_training tag="method"} +## TextCategorizer.initialize {#initialize tag="method" new="3"} -Initialize the component for training and return an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a -function that returns an iterable of [`Example`](/api/example) objects. The data -examples are used to **initialize the model** of the component and can either be -the full training data or a representative sample. Initialization includes -validating the network, +Initialize the component for training. `get_examples` should be a function that +returns an iterable of [`Example`](/api/example) objects. The data examples are +used to **initialize the model** of the component and can either be the full +training data or a representative sample. Initialization includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. + + + +This method was previously called `begin_training`. + + > #### Example > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline) +> textcat.initialize(lambda: [], nlp=nlp) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.textcat] +> +> [initialize.components.textcat.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/textcat.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. 
~~Optional[dict]~~ | ## TextCategorizer.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -170,7 +187,7 @@ modifying them. ## TextCategorizer.set_annotations {#set_annotations tag="method"} -Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. +Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores. > #### Example > @@ -196,14 +213,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = textcat.update(examples, sgd=optimizer) > ``` | Name | Description | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | @@ -213,7 +230,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the -current model to make predictions similar to an initial model, to try to address +current model to make predictions similar to an initial model to try to address the "catastrophic forgetting" problem. This feature is experimental. > #### Example @@ -227,7 +244,7 @@ the "catastrophic forgetting" problem. This feature is experimental. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | @@ -286,7 +303,7 @@ Create an optimizer for the pipeline component. ## TextCategorizer.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. +Modify the pipe's model to use the given parameter values. > #### Example > @@ -303,12 +320,12 @@ Modify the pipe's model, to use the given parameter values. ## TextCategorizer.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. 
In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 5c7214edc..051164ff5 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tok2Vec.begin_training {#begin_training tag="method"} +## Tok2Vec.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -132,26 +132,25 @@ examples are used to **initialize the model** of the component and can either be the full training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline) +> tok2vec.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Tok2Vec.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -193,7 +192,7 @@ Delegates to [`predict`](/api/tok2vec#predict). > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tok2vec.update(examples, sgd=optimizer) > ``` @@ -224,7 +223,7 @@ Create an optimizer for the pipeline component. ## Tok2Vec.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. At the end of the +Modify the pipe's model to use the given parameter values. At the end of the context, the original parameters are restored. > #### Example diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 0860797aa..e7e66e931 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -172,6 +172,25 @@ Get a neighboring token. 
| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ | | **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ | +## Token.set_morph {#set_morph tag="method"} + +Set the morphological analysis from a UD FEATS string, hash value of a UD FEATS +string, features dict or `MorphAnalysis`. The value `None` can be used to reset +the morph to an unset state. + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> doc[0].set_morph("Mood=Imp|VerbForm=Fin") +> assert "Mood=Imp" in doc[0].morph +> assert doc[0].morph.get("Mood") == ["Imp"] +> ``` + +| Name | Description | +| -------- | --------------------------------------------------------------------------------- | +| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ | + ## Token.is_ancestor {#is_ancestor tag="method" model="parser"} Check whether this token is a parent, grandparent, etc. of another in the @@ -243,7 +262,7 @@ A sequence of the token's immediate syntactic children. ## Token.lefts {#lefts tag="property" model="parser"} -The leftward immediate children of the word, in the syntactic dependency parse. +The leftward immediate children of the word in the syntactic dependency parse. > #### Example > @@ -259,7 +278,7 @@ The leftward immediate children of the word, in the syntactic dependency parse. ## Token.rights {#rights tag="property" model="parser"} -The rightward immediate children of the word, in the syntactic dependency parse. +The rightward immediate children of the word in the syntactic dependency parse. > #### Example > @@ -275,7 +294,7 @@ The rightward immediate children of the word, in the syntactic dependency parse. ## Token.n_lefts {#n_lefts tag="property" model="parser"} -The number of leftward immediate children of the word, in the syntactic +The number of leftward immediate children of the word in the syntactic dependency parse. > #### Example @@ -291,7 +310,7 @@ dependency parse. ## Token.n_rights {#n_rights tag="property" model="parser"} -The number of rightward immediate children of the word, in the syntactic +The number of rightward immediate children of the word in the syntactic dependency parse. > #### Example @@ -392,74 +411,73 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Description | -| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | The parent document. ~~Doc~~ | -| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | -| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | -| `text` | Verbatim text content. ~~str~~ | -| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | -| `whitespace_` | Trailing space character if present. ~~str~~ | -| `orth` | ID of the verbatim text content. ~~int~~ | -| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | -| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | -| `tensor` 2.1.7 | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | -| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | -| `left_edge` | The leftmost token of this token's syntactic descendants. 
~~Token~~ | -| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | -| `i` | The index of the token within the parent document. ~~int~~ | -| `ent_type` | Named entity type. ~~int~~ | -| `ent_type_` | Named entity type. ~~str~~ | -| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | -| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | -| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | -| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | -| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | -| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | -| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | -| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | -| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~ | -| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ | -| `lower` | Lowercase form of the token. ~~int~~ | -| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | -| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | -| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | -| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | -| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | -| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | -| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ | -| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | -| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | -| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | -| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | -| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | -| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | -| `is_punct` | Is the token punctuation? ~~bool~~ | -| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? 
~~bool~~ | -| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | -| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | -| `is_bracket` | Is the token a bracket? ~~bool~~ | -| `is_quote` | Is the token a quotation mark? ~~bool~~ | -| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | -| `like_url` | Does the token resemble a URL? ~~bool~~ | -| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | -| `like_email` | Does the token resemble an email address? ~~bool~~ | -| `is_oov` | Does the token have a word vector? ~~bool~~ | -| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | -| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | -| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | -| `tag` | Fine-grained part-of-speech. ~~int~~ | -| `tag_` | Fine-grained part-of-speech. ~~str~~ | -| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | -| `morph_` 3 | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ | -| `dep` | Syntactic dependency relation. ~~int~~ | -| `dep_` | Syntactic dependency relation. ~~str~~ | -| `lang` | Language of the parent document's vocabulary. ~~int~~ | -| `lang_` | Language of the parent document's vocabulary. ~~str~~ | -| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | -| `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | -| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | -| `cluster` | Brown cluster ID. ~~int~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | +| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | +| `text` | Verbatim text content. ~~str~~ | +| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | +| `whitespace_` | Trailing space character if present. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `vocab` | The vocab object of the parent `Doc`. ~~vocab~~ | +| `tensor` 2.1.7 | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | +| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | +| `left_edge` | The leftmost token of this token's syntactic descendants. 
~~Token~~ | +| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | +| `i` | The index of the token within the parent document. ~~int~~ | +| `ent_type` | Named entity type. ~~int~~ | +| `ent_type_` | Named entity type. ~~str~~ | +| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | +| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | +| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ | +| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | +| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | +| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | +| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | +| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | +| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~ | +| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~ | +| `lower` | Lowercase form of the token. ~~int~~ | +| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | +| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ | +| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ | +| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | +| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | +| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | +| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ | +| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | +| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | +| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | +| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | +| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | +| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | +| `is_punct` | Is the token punctuation? ~~bool~~ | +| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? 
~~bool~~ | +| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | +| `is_bracket` | Is the token a bracket? ~~bool~~ | +| `is_quote` | Is the token a quotation mark? ~~bool~~ | +| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | +| `like_url` | Does the token resemble a URL? ~~bool~~ | +| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | +| `like_email` | Does the token resemble an email address? ~~bool~~ | +| `is_oov` | Does the token have a word vector? ~~bool~~ | +| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | +| `tag` | Fine-grained part-of-speech. ~~int~~ | +| `tag_` | Fine-grained part-of-speech. ~~str~~ | +| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | +| `dep` | Syntactic dependency relation. ~~int~~ | +| `dep_` | Syntactic dependency relation. ~~str~~ | +| `lang` | Language of the parent document's vocabulary. ~~int~~ | +| `lang_` | Language of the parent document's vocabulary. ~~str~~ | +| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | +| `idx` | The character offset of the token within the parent document. ~~int~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | +| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `cluster` | Brown cluster ID. ~~int~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 0158c5589..8809c10bc 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -1,6 +1,6 @@ --- title: Tokenizer -teaser: Segment text into words, punctuations marks etc. +teaser: Segment text into words, punctuations marks, etc. tag: class source: spacy/tokenizer.pyx --- @@ -15,16 +15,15 @@ source: spacy/tokenizer.pyx Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). -The tokenizer is typically created automatically when the a +The tokenizer is typically created automatically when a [`Language`](/api/language) subclass is initialized and it reads its settings like punctuation and special case rules from the [`Language.Defaults`](/api/language#defaults) provided by the language subclass. ## Tokenizer.\_\_init\_\_ {#init tag="method"} -Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples -of how to construct a custom tokenizer with different tokenization rules, see -the +Create a `Tokenizer` to create `Doc` objects given unicode text. For examples of +how to construct a custom tokenizer with different tokenization rules, see the [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers). 
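As a minimal sketch of what such a custom tokenizer can look like (the special-case rule and infix pattern below are invented for illustration and are not part of this diff):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")

# Hypothetical rules: keep ":)" as a single token and split tokens on hyphens.
special_cases = {":)": [{"ORTH": ":)"}]}
infix_re = re.compile(r"[-~]")

# Swap in a tokenizer that uses the custom special cases and infix pattern.
nlp.tokenizer = Tokenizer(
    nlp.vocab, rules=special_cases, infix_finditer=infix_re.finditer
)

doc = nlp("well-behaved tokenizers :)")
print([token.text for token in doc])  # ['well', '-', 'behaved', 'tokenizers', ':)']
```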
> #### Example @@ -87,7 +86,7 @@ Tokenize a stream of texts. | ------------ | ------------------------------------------------------------------------------------ | | `texts` | A sequence of unicode texts. ~~Iterable[str]~~ | | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ | -| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ | +| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ | ## Tokenizer.find_infix {#find_infix tag="method"} @@ -121,10 +120,10 @@ if no suffix rules match. ## Tokenizer.add_special_case {#add_special_case tag="method"} Add a special-case tokenization rule. This mechanism is also used to add custom -tokenizer exceptions to the language data. See the usage guide on -[adding languages](/usage/adding-languages#tokenizer-exceptions) and -[linguistic features](/usage/linguistic-features#special-cases) for more details -and examples. +tokenizer exceptions to the language data. See the usage guide on the +[languages data](/usage/linguistic-features#language-data) and +[tokenizer special cases](/usage/linguistic-features#special-cases) for more +details and examples. > #### Example > diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f36be0806..d7273b651 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -7,7 +7,8 @@ menu: - ['Loggers', 'loggers'] - ['Readers', 'readers'] - ['Batchers', 'batchers'] - - ['Data & Alignment', 'gold'] + - ['Augmenters', 'augmenters'] + - ['Training & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -191,16 +192,16 @@ browser. Will run a simple web server. > displacy.serve([doc1, doc2], style="dep") > ``` -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | -| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | -| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | -| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | -| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | -| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | -| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | -| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | +| Name | Description | +| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | +| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | +| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | +| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | +| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | +| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `port` | Port to serve visualization. Defaults to `5000`. 
~~int~~ | +| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | ### displacy.render {#displacy.render tag="method" new="2"} @@ -223,7 +224,7 @@ Render a dependency parse tree or named entity visualization. | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | -| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ | | **RETURNS** | The rendered HTML markup. ~~str~~ | @@ -244,7 +245,7 @@ If a setting is not present in the options, the default value will be used. | Name | Description | | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` 2.2.4 | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `add_lemma` 2.2.4 | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | @@ -268,11 +269,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. 
~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -313,6 +314,7 @@ factories. | Registry name | Description | | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | +| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. | | `batchers` | Registry for training and evaluation [data batchers](#batchers). | | `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | @@ -325,7 +327,7 @@ factories. | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | | `misc` | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need. | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | -| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). | +| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | @@ -370,7 +372,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers listed here, you can also [implement your own](/usage/training#custom-logging). -#### ConsoleLogger {#ConsoleLogger tag="registered function"} +#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} > #### Example config > @@ -416,7 +418,7 @@ start decreasing across epochs. -#### WandbLogger {#WandbLogger tag="registered function"} +#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"} > #### Installation > @@ -468,7 +470,65 @@ logging the results. -## Readers {#readers source="spacy/training/corpus.py" new="3"} +## Readers {#readers} + +### File readers {#file-readers source="github.com/explosion/srsly" new="3"} + +The following file readers are provided by our serialization library +[`srsly`](https://github.com/explosion/srsly). All registered functions take one +argument `path`, pointing to the file path to load. 
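As a rough illustration of what these registered readers resolve to, the equivalent direct calls in Python look roughly like this (the file paths are placeholders, not files that ship with spaCy):

```python
import srsly

# Each registered reader corresponds to one of srsly's top-level helpers.
variants = srsly.read_json("corpus/en_orth_variants.json")   # JSON -> dict or list
texts = list(srsly.read_jsonl("corpus/raw_text.jsonl"))      # JSONL -> iterator of objects
project = srsly.read_yaml("project.yml")                     # YAML -> dict
lookups = srsly.read_msgpack("corpus/lookups.msgpack")       # MessagePack -> Python object
```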
+ +> #### Example config +> +> ```ini +> [corpora.train.augmenter.orth_variants] +> @readers = "srsly.read_json.v1" +> path = "corpus/en_orth_variants.json" +> ``` + +| Name | Description | +| ----------------------- | ----------------------------------------------------- | +| `srsly.read_json.v1` | Read data from a JSON file. | +| `srsly.read_jsonl.v1` | Read data from a JSONL (newline-delimited JSON) file. | +| `srsly.read_yaml.v1` | Read data from a YAML file. | +| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file. | + + + +Since the file readers expect a local path, you should only use them in config +blocks that are **not executed at runtime** – for example, in `[training]` and +`[corpora]` (to load data or resources like data augmentation tables) or in +`[initialize]` (to pass data to pipeline components). + + + +#### spacy.read_labels.v1 {#read_labels tag="registered function"} + +Read a JSON-formatted labels file generated with +[`init labels`](/api/cli#init-labels). Typically used in the +[`[initialize]`](/api/data-formats#config-initialize) block of the training +config to speed up the model initialization process and provide pre-generated +label sets. + +> #### Example config +> +> ```ini +> [initialize.components] +> +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json" +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~ | +| `require` | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ | +| **CREATES** | The | + +### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"} Corpus readers are registered functions that load data and return a function that takes the current `nlp` object and yields [`Example`](/api/example) objects @@ -478,7 +538,7 @@ with your own registered function in the [`@readers` registry](/api/top-level#registry) to customize the data loading and streaming. -### Corpus {#corpus} +#### spacy.Corpus.v1 {#corpus tag="registered function"} The `Corpus` reader manages annotated corpora and can be used for training and development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see @@ -498,19 +558,21 @@ the [`Corpus`](/api/corpus) class. > limit = 0 > ``` -| Name | Description | -| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ | -|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | -| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | -| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. 
~~int~~ | +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ | +|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| `augmenter` | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | +| **CREATES** | The corpus reader. ~~Corpus~~ | -### JsonlReader {#jsonlreader} +#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"} Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON) file of texts keyed by `"text"`. Can be used to read the raw text corpus for language model [pretraining](/usage/embeddings-transformers#pretraining) from a -JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. +JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class. > #### Example config > @@ -519,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. > pretrain = "corpus/raw_text.jsonl" > > [corpora.pretrain] -> @readers = "spacy.JsonlReader.v1" +> @readers = "spacy.JsonlCorpus.v1" > path = ${paths.pretrain} > min_length = 0 > max_length = 0 @@ -532,6 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | +| **CREATES** | The corpus reader. ~~JsonlCorpus~~ | ## Batchers {#batchers source="spacy/training/batchers.py" new="3"} @@ -547,7 +610,7 @@ Instead of using one of the built-in batchers listed here, you can also [implement your own](/usage/training#custom-code-readers-batchers), which may or may not use a custom schedule. -### batch_by_words {#batch_by_words tag="registered function"} +### spacy.batch_by_words.v1 {#batch_by_words tag="registered function"} Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by @@ -573,8 +636,9 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument | `tolerance` | What percentage of the size to allow batches to exceed. ~~float~~ | | `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. 
~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | +| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | -### batch_by_sequence {#batch_by_sequence tag="registered function"} +### spacy.batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} > #### Example config > @@ -591,8 +655,9 @@ Create a batcher that creates batches of the specified size. | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | +| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | -### batch_by_padded {#batch_by_padded tag="registered function"} +### spacy.batch_by_padded.v1 {#batch_by_padded tag="registered function"} > #### Example config > @@ -616,6 +681,42 @@ sequences in the batch. | `buffer` | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ | | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | +| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | + +## Augmenters {#augmenters source="spacy/training/augment.py" new="3"} + +Data augmentation is the process of applying small modifications to the training +data. It can be especially useful for punctuation and case replacement – for +example, if your corpus only uses smart quotes and you want to include +variations using regular quotes, or to make the model less sensitive to +capitalization by including a mix of capitalized and lowercase examples. See the [usage guide](/usage/training#data-augmentation) for details and examples. + +### spacy.orth_variants.v1 {#orth_variants tag="registered function"} + +> #### Example config +> +> ```ini +> [corpora.train.augmenter] +> @augmenters = "spacy.orth_variants.v1" +> level = 0.1 +> lower = 0.5 +> +> [corpora.train.augmenter.orth_variants] +> @readers = "srsly.read_json.v1" +> path = "corpus/en_orth_variants.json" +> ``` + +Create a data augmentation callback that uses orth-variant replacement. The +callback can be added to a corpus or other data iterator during training. This +is especially useful for punctuation and case replacement, to help generalize +beyond corpora that don't have smart quotes, or only have smart quotes etc. 
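For illustration, a minimal sketch of how this augmenter might be created and attached to a corpus reader directly in Python instead of via the config. The paths are placeholders, and passing `augmenter` to the `Corpus` class is an assumption that mirrors the `augmenter` setting documented for `spacy.Corpus.v1` above:

```python
import srsly
from spacy.util import registry
from spacy.training import Corpus

# Load the orth variants table, e.g. the JSON file from the example config.
orth_variants = srsly.read_json("corpus/en_orth_variants.json")

# Resolve the registered function and create the augmentation callback.
create_augmenter = registry.augmenters.get("spacy.orth_variants.v1")
augmenter = create_augmenter(level=0.1, lower=0.5, orth_variants=orth_variants)

# Pass the callback to the corpus reader so examples are augmented as they're yielded
# (assumed to mirror the `augmenter` config setting shown above).
train_corpus = Corpus("corpus/train.spacy", augmenter=augmenter)
```

The parameters accepted by the registered function are listed in the table below.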
+ +| Name | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `level` | The percentage of texts that will be augmented. ~~float~~ | +| `lower` | The percentage of texts that will be lowercased. ~~float~~ | +| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, Dict[List[Union[str, List[str]]]]]~~ | +| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | ## Training data and alignment {#gold source="spacy/training"} @@ -623,7 +724,7 @@ sequences in the batch. Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, -Out). Returns a list of strings, describing the tags. Each tag string will be of +Out). Returns a list of strings, describing the tags. Each tag string will be in the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with the tokenization in the `Doc` object. The training algorithm @@ -726,7 +827,7 @@ utilities. ### util.get_lang_class {#util.get_lang_class tag="function"} Import and load a `Language` class. Allows lazy-loading -[language data](/usage/adding-languages) and importing languages using the +[language data](/usage/linguistic-features#language-data) and importing languages using the two-letter language code. To add a language code for a custom language class, you can register it using the [`@registry.languages`](/api/top-level#registry) decorator. @@ -747,7 +848,7 @@ decorator. ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} Check whether a `Language` subclass is already loaded. `Language` subclasses are -loaded lazily, to avoid expensive setup code associated with the language data. +loaded lazily to avoid expensive setup code associated with the language data. > #### Example > @@ -935,7 +1036,7 @@ Compile a sequence of prefix rules into a regex object. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} @@ -952,7 +1053,7 @@ Compile a sequence of suffix rules into a regex object. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The suffix rules, e.g. 
[`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_infix_regex {#util.compile_infix_regex tag="function"} @@ -969,7 +1070,7 @@ Compile a sequence of infix rules into a regex object. | Name | Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.minibatch {#util.minibatch tag="function" new="2"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index d5bcef229..abceeff4f 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Transformer.begin_training {#begin_training tag="method"} +## Transformer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -167,26 +167,25 @@ examples are used to **initialize the model** of the component and can either be the full training data or a representative sample. Initialization includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +setting up the label scheme based on the data. This method is typically called +by [`Language.initialize`](/api/language#initialize). > #### Example > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline) +> trf.initialize(lambda: [], nlp=nlp) > ``` | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | -| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| **RETURNS** | The optimizer. ~~Optimizer~~ | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ## Transformer.predict {#predict tag="method"} -Apply the component's model to a batch of [`Doc`](/api/doc) objects, without +Apply the component's model to a batch of [`Doc`](/api/doc) objects without modifying them. > #### Example @@ -203,7 +202,7 @@ modifying them. 
## Transformer.set_annotations {#set_annotations tag="method"} -Assign the extracted features to the Doc objects. By default, the +Assign the extracted features to the `Doc` objects. By default, the [`TransformerData`](/api/transformer#transformerdata) object is written to the [`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` callback is then called, if provided. @@ -241,7 +240,7 @@ and call the optimizer, while the others simply increment the gradients. > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = trf.update(examples, sgd=optimizer) > ``` @@ -272,7 +271,7 @@ Create an optimizer for the pipeline component. ## Transformer.use_params {#use_params tag="method, contextmanager"} -Modify the pipe's model, to use the given parameter values. At the end of the +Modify the pipe's model to use the given parameter values. At the end of the context, the original parameters are restored. > #### Example @@ -388,8 +387,8 @@ by this class. Instances of this class are typically assigned to the | Name | Description | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | -| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | +| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | +| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | @@ -409,7 +408,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch. 
| Name | Description | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | +| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | | `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | @@ -439,10 +438,10 @@ Split a `TransformerData` object that represents a batch into a list with one ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} Span getters are functions that take a batch of [`Doc`](/api/doc) objects and -return a lists of [`Span`](/api/span) objects for each doc, to be processed by -the transformer. This is used to manage long documents, by cutting them into +return a lists of [`Span`](/api/span) objects for each doc to be processed by +the transformer. This is used to manage long documents by cutting them into smaller sequences before running the transformer. The spans are allowed to -overlap, and you can also omit sections of the Doc if they are not relevant. +overlap, and you can also omit sections of the `Doc` if they are not relevant. Span getters can be referenced in the `[components.transformer.model.get_spans]` block of the config to customize the sequences processed by the transformer. You diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 7e97b4ca3..ba2d5ab42 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -290,7 +290,7 @@ If a table is full, it can be resized using ## Vectors.n_keys {#n_keys tag="property"} Get the number of keys in the table. Note that this is the number of _all_ keys, -not just unique vectors. If several keys are mapped are mapped to the same +not just unique vectors. If several keys are mapped to the same vectors, they will be counted individually. > #### Example @@ -307,10 +307,10 @@ vectors, they will be counted individually. 
## Vectors.most_similar {#most_similar tag="method"} -For each of the given vectors, find the `n` most similar entries to it, by +For each of the given vectors, find the `n` most similar entries to it by cosine. Queries are by vector. Results are returned as a `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are -performed in chunks, to avoid consuming too much memory. You can set the +performed in chunks to avoid consuming too much memory. You can set the `batch_size` to control the size/space trade-off during the calculations. > #### Example diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 71a678cb3..a2ca63002 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -29,7 +29,7 @@ Create the vocabulary. | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | -| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | +| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words would then be remapped to the closest remaining vector – so "feline" would have the same vector as "cat", and "reclined" would have the same vector as "sat". The similarities are judged by cosine. The original vectors may be large, so the -cosines are calculated in minibatches, to reduce memory usage. +cosines are calculated in minibatches to reduce memory usage. > #### Example > @@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage. Retrieve a vector for a word in the vocabulary. Words can be looked up by string or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s -subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). +subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`). > #### Example > @@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | -| `minn` 2.1 | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | -| `maxn` 2.1 | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | +| `minn` 2.1 | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | +| `maxn` 2.1 | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. 
~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} -Set a vector for a word in the vocabulary. Words can be referenced by by string +Set a vector for a word in the vocabulary. Words can be referenced by string or hash value. > #### Example diff --git a/website/docs/images/lifecycle.svg b/website/docs/images/lifecycle.svg new file mode 100644 index 000000000..2f4b304b8 --- /dev/null +++ b/website/docs/images/lifecycle.svg @@ -0,0 +1,93 @@ + [93 lines of SVG markup for the new pipeline lifecycle illustration, omitted] diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index 9a63ee42d..f43219f41 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -32,7 +32,7 @@ the [config](/usage/training#config): ```ini [nlp] -pipeline = ["tagger", "parser", "ner"] +pipeline = ["tok2vec", "tagger", "parser", "ner"] ``` import Accordion from 'components/accordion.js' diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 97249bfb2..c615097d6 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_. If you have a list of terms with no context around them, a transformer model like BERT can't really help you. BERT is designed to understand language **in context**, which isn't what you have. A word vectors table will be a much better -fit for your task. However, if you do have words in context — whole sentences or -paragraphs of running text — word vectors will only provide a very rough +fit for your task. However, if you do have words in context – whole sentences or +paragraphs of running text – word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a
```python -from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor +from thinc.api import add, chain, remap_ids, Embed from spacy.ml.staticvectors import StaticVectors +from spacy.ml.featureextractor import FeatureExtractor from spacy.util import registry @registry.architectures("my_example.MyEmbedding.v1") @@ -621,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`, `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and expect the same types of objects, although for pretraining your corpus does not need to have any annotations, so you will often use a different reader, such as -the [`JsonlReader`](/api/top-level#jsonlreader). +the [`JsonlCorpus`](/api/top-level#jsonlcorpus). > #### Raw text format > @@ -675,7 +676,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific subnetwork** within one of your components, and add additional layers to build a -network for a temporary task, that forces the model to learn something about +network for a temporary task that forces the model to learn something about sentence structure and word cooccurrence statistics. Pretraining produces a **binary weights file** that can be loaded back in at the start of training. The weights file specifies an initial set of weights. Training then proceeds as diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index a58ba2ba9..b65c3d903 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure that their internal models are **always initialized** with appropriate sample data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a ~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This -functionality is triggered when -[`nlp.begin_training`](/api/language#begin_training) is called. +functionality is triggered when [`nlp.initialize`](/api/language#initialize) is +called. ### Dropout and normalization in Thinc {#thinc-dropout-norm} @@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}): diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index d9a894398..7b9aaa0b9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -56,16 +56,13 @@ create a surface form. Here are some examples: Morphological features are stored in the [`MorphAnalysis`](/api/morphanalysis) under `Token.morph`, which allows you to access individual morphological -features. The attribute `Token.morph_` provides the morphological analysis in -the Universal Dependencies -[FEATS](https://universaldependencies.org/format.html#morphological-annotation) -format. +features. > #### 📝 Things to try > > 1. Change "I" to "She". You should see that the morphological features change > and express that it's a pronoun in the third person. -> 2. Inspect `token.morph_` for the other tokens. +> 2. Inspect `token.morph` for the other tokens. 
```python ### {executable="true"} @@ -75,7 +72,7 @@ nlp = spacy.load("en_core_web_sm") print("Pipeline:", nlp.pipe_names) doc = nlp("I was reading the paper.") token = doc[0] # 'I' -print(token.morph_) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs' +print(token.morph) # 'Case=Nom|Number=Sing|Person=1|PronType=Prs' print(token.morph.get("PronType")) # ['Prs'] ``` @@ -91,7 +88,7 @@ import spacy nlp = spacy.load("de_core_news_sm") doc = nlp("Wo bist du?") # English: 'Where are you?' -print(doc[2].morph_) # 'Case=Nom|Number=Sing|Person=2|PronType=Prs' +print(doc[2].morph) # 'Case=Nom|Number=Sing|Person=2|PronType=Prs' print(doc[2].pos_) # 'PRON' ``` @@ -117,7 +114,7 @@ import spacy nlp = spacy.load("en_core_web_sm") doc = nlp("Where are you?") -print(doc[2].morph_) # 'Case=Nom|Person=2|PronType=Prs' +print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs' print(doc[2].pos_) # 'PRON' ``` @@ -1834,10 +1831,12 @@ word vector libraries output an easy-to-read text-based format, where each line consists of the word followed by its vector. For everyday use, we want to convert the vectors into a binary format that loads faster and takes up less space on disk. The easiest way to do this is the -[`init vocab`](/api/cli#init-vocab) command-line utility. This will output a +[`init vectors`](/api/cli#init-vectors) command-line utility. This will output a blank spaCy pipeline in the directory `/tmp/la_vectors_wiki_lg`, giving you access to some nice Latin vectors. You can then pass the directory path to -[`spacy.load`](/api/top-level#spacy.load). +[`spacy.load`](/api/top-level#spacy.load) or use it in the +[`[initialize]`](/api/data-formats#config-initialize) of your config when you +[train](/usage/training) a model. > #### Usage example > @@ -1850,7 +1849,7 @@ access to some nice Latin vectors. You can then pass the directory path to ```cli $ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz -$ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz +$ python -m spacy init vectors en cc.la.300.vec.gz /tmp/la_vectors_wiki_lg ``` @@ -1858,9 +1857,9 @@ $ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300. To help you strike a good balance between coverage and memory usage, spaCy's [`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same row** of the table. If you're using the -[`spacy init vocab`](/api/cli#init-vocab) command to create a vocabulary, -pruning the vectors will be taken care of automatically if you set the -`--prune-vectors` flag. You can also do it manually in the following steps: +[`spacy init vectors`](/api/cli#init-vectors) command to create a vocabulary, +pruning the vectors will be taken care of automatically if you set the `--prune` +flag. You can also do it manually in the following steps: 1. Start with a **word vectors package** that covers a huge vocabulary. For instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) @@ -1905,12 +1904,12 @@ the two words. In the example above, the vector for "Shore" was removed and remapped to the vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to the vector of "leaving", which is identical. 
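For reference, a minimal sketch of what the manual pruning described above might look like in code, assuming the `en_vectors_web_lg` starter mentioned in step 1 is installed (the exact remappings and scores depend on the vectors you start from):

```python
import spacy

# Assumption: a package with a large vectors table, e.g. en_vectors_web_lg
nlp = spacy.load("en_vectors_web_lg")
n_vectors = 10000  # number of unique vectors to keep
removed_words = nlp.vocab.prune_vectors(n_vectors)

# Each pruned entry maps to its closest remaining key plus a similarity score
print(removed_words.get("Shore"))    # e.g. ('coast', 0.73), as in the example above
print(removed_words.get("Leaving"))  # e.g. ('leaving', 1.0)
```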
If you're using the -[`init vocab`](/api/cli#init-vocab) command, you can set the `--prune-vectors` +[`init vectors`](/api/cli#init-vectors) command, you can set the `--prune` option to easily reduce the size of the vectors as you add them to a spaCy pipeline: ```cli -$ python -m spacy init vocab en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 +$ python -m spacy init vectors en la.300d.vec.tgz /tmp/la_vectors_web_md --prune 10000 ``` This will create a blank spaCy pipeline with vectors for the first 10,000 words diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 9b686c947..dc41385f2 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -30,7 +30,7 @@ import QuickstartModels from 'widgets/quickstart-models.js' ## Language support {#languages} spaCy currently provides support for the following languages. You can help by -[improving the existing language data](/usage/adding-languages#language-data) +improving the existing [language data](/usage/linguistic-features#language-data) and extending the tokenization patterns. [See here](https://github.com/explosion/spaCy/issues/3056) for details on how to contribute to development. @@ -83,74 +83,95 @@ To train a pipeline using the neutral multi-language class, you can set import the `MultiLanguage` class directly, or call [`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. -### Chinese language support {#chinese new=2.3} +### Chinese language support {#chinese new="2.3"} -The Chinese language class supports three word segmentation options: +The Chinese language class supports three word segmentation options, `char`, +`jieba` and `pkuseg`. +> #### Manual setup +> > ```python > from spacy.lang.zh import Chinese > > # Character segmentation (default) > nlp = Chinese() -> > # Jieba > cfg = {"segmenter": "jieba"} -> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) -> +> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) > # PKUSeg with "default" model provided by pkuseg -> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} -> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> cfg = {"segmenter": "pkuseg"} +> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) +> nlp.tokenizer.initialize(pkuseg_model="default") > ``` -1. **Character segmentation:** Character segmentation is the default - segmentation option. It's enabled when you create a new `Chinese` language - class or call `spacy.blank("zh")`. -2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word - segmentation with the tokenizer option `{"segmenter": "jieba"}`. -3. **PKUSeg**: As of spaCy v2.3.0, support for - [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support - better segmentation for Chinese OntoNotes and the provided - [Chinese pipelines](/models/zh). Enable PKUSeg with the tokenizer option - `{"segmenter": "pkuseg"}`. - - - -In spaCy v3.0, the default Chinese word segmenter has switched from Jieba to -character segmentation. Also note that -[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with -pre-compiled wheels for Python 3.8. 
If you're running Python 3.8, you can -install it from our fork and compile it locally: - -```bash -$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip +```ini +### config.cfg +[nlp.tokenizer] +@tokenizers = "spacy.zh.ChineseTokenizer" +segmenter = "char" ``` +| Segmenter | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `char` | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. | +| `jieba` | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. | +| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. | + + + +In v3.0, the default word segmenter has switched from Jieba to character +segmentation. Because the `pkuseg` segmenter depends on a model that can be +loaded from a file, the model is loaded on +[initialization](/usage/training#config-lifecycle) (typically before training). +This ensures that your packaged Chinese model doesn't depend on a local path at +runtime. + -The `meta` argument of the `Chinese` language class supports the following -following tokenizer config settings: +The `initialize` method for the Chinese tokenizer class supports the following +config settings for loading `pkuseg` models: -| Name | Description | -| ------------------ | --------------------------------------------------------------------------------------------------------------- | -| `segmenter` | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~ | -| `pkuseg_model` | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ | -| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~ | +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ | +| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ | + +The initialization settings are typically provided in the +[training config](/usage/training#config) and the data is loaded in before +training and serialized with the model. This allows you to load the data from a +local path and save out your pipeline and config, without requiring the same +local path at runtime. See the usage guide on the +[config lifecycle](/usage/training#config-lifecycle) for more background on +this. 
+ +```ini +### config.cfg +[initialize] + +[initialize.tokenizer] +pkuseg_model = "/path/to/model" +pkuseg_user_dict = "default" +``` + +You can also initialize the tokenizer for a blank language class by calling its +`initialize` method: ```python ### Examples +# Initialize the pkuseg tokenizer +cfg = {"segmenter": "pkuseg"} +nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) + # Load "default" model -cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} -nlp = Chinese(config={"tokenizer": {"config": cfg}}) +nlp.tokenizer.initialize(pkuseg_model="default") # Load local model -cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"} -nlp = Chinese(config={"tokenizer": {"config": cfg}}) +nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model") # Override the user directory -cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"} -nlp = Chinese(config={"tokenizer": {"config": cfg}}) +nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict") ``` You can also modify the user dictionary on-the-fly: @@ -185,36 +206,46 @@ from spacy.lang.zh import Chinese # Train pkuseg model pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model") + # Load pkuseg model in spaCy Chinese tokenizer -nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}}) +cfg = {"segmenter": "pkuseg"} +nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) +nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model") ``` ### Japanese language support {#japanese new=2.3} +> #### Manual setup +> > ```python > from spacy.lang.ja import Japanese > > # Load SudachiPy with split mode A (default) > nlp = Japanese() -> > # Load SudachiPy with split mode B > cfg = {"split_mode": "B"} -> nlp = Japanese(meta={"tokenizer": {"config": cfg}}) +> nlp = Japanese.from_config({"nlp": {"tokenizer": cfg}}) > ``` The Japanese language class uses [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word segmentation and part-of-speech tagging. The default Japanese language class and -the provided Japanese pipelines use SudachiPy split mode `A`. The `meta` -argument of the `Japanese` language class can be used to configure the split -mode to `A`, `B` or `C`. +the provided Japanese pipelines use SudachiPy split mode `A`. The tokenizer +config can be used to configure the split mode to `A`, `B` or `C`. + +```ini +### config.cfg +[nlp.tokenizer] +@tokenizers = "spacy.ja.JapaneseTokenizer" +split_mode = "A" +``` If you run into errors related to `sudachipy`, which is currently under active -development, we suggest downgrading to `sudachipy==0.4.5`, which is the version +development, we suggest downgrading to `sudachipy==0.4.9`, which is the version used for training the current [Japanese pipelines](/models/ja). diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index dbf0881ac..c98bd08bc 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -167,8 +167,8 @@ the binary data: ```python ### spacy.load under the hood lang = "en" -pipeline = ["tagger", "parser", "ner"] -data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0" +pipeline = ["tok2vec", "tagger", "parser", "ner"] +data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0" cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English nlp = cls() # 2. Initialize it @@ -197,9 +197,9 @@ list of human-readable component names. 
```python print(nlp.pipeline) -# [('tagger', ), ('parser', ), ('ner', )] +# [('tok2vec', ), ('tagger', ), ('parser', ), ('ner', )] print(nlp.pipe_names) -# ['tagger', 'parser', 'ner'] +# ['tok2vec', 'tagger', 'parser', 'ner'] ``` ### Built-in pipeline components {#built-in} @@ -895,6 +895,10 @@ the name. Registered functions can also take **arguments** by the way that can be defined in the config as well – you can read more about this in the docs on [training with custom code](/usage/training#custom-code). +### Initializing components with data {#initialization} + + + ### Python type hints and pydantic validation {#type-hints new="3"} spaCy's configs are powered by our machine learning library Thinc's @@ -1126,12 +1130,12 @@ For some use cases, it makes sense to also overwrite additional methods to customize how the model is updated from examples, how it's initialized, how the loss is calculated and to add evaluation scores to the training output. -| Name | Description | -| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | -| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. | -| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | -| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | +| Name | Description | +| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | +| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. | +| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | +| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. 
The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 06fb18591..f8a5eea2a 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -291,7 +291,7 @@ installed in the same environment – that's it. | Entry point | Description | | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [`spacy_factories`](#entry-points-components) | Group of entry points for pipeline component factories, keyed by component name. Can be used to expose custom components defined by another package. | -| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut. | +| [`spacy_languages`](#entry-points-languages) | Group of entry points for custom [`Language` subclasses](/usage/linguistic-features#language-data), keyed by language shortcut. | | `spacy_lookups` 2.2 | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. | | [`spacy_displacy_colors`](#entry-points-displacy) 2.2 | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. | diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index cd1b2cb0c..5d7c7d7a5 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -200,7 +200,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md' To learn more about how spaCy's tokenization rules work in detail, how to **customize and replace** the default tokenizer and how to **add language-specific data**, see the usage guides on -[adding languages](/usage/adding-languages) and +[language data](/usage/linguistic-features#language-data) and [customizing the tokenizer](/usage/linguistic-features#tokenization). @@ -479,7 +479,7 @@ find a "Suggest edits" link at the bottom of each page that points you to the source. Another way of getting involved is to help us improve the -[language data](/usage/adding-languages#language-data) – especially if you +[language data](/usage/linguistic-features#language-data) – especially if you happen to speak one of the languages currently in [alpha support](/usage/models#languages). Even adding simple tokenizer exceptions, stop words or lemmatizer data can make a big difference.
It will diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 97992287b..1dd57fd4a 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,8 +6,9 @@ menu: - ['Introduction', 'basics'] - ['Quickstart', 'quickstart'] - ['Config System', 'config'] - + - ['Custom Training', 'config-custom'] - ['Custom Functions', 'custom-functions'] + - ['Data Utilities', 'data'] - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -122,7 +123,7 @@ treebank. -## Training config {#config} +## Training config system {#config} Training config files include all **settings and hyperparameters** for training your pipeline. Instead of providing lots of arguments on the command line, you @@ -177,6 +178,7 @@ sections of a config file are: | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. | | `training` | Settings and controls for the training and evaluation process. | | `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). | +| `initialize` | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). | @@ -190,6 +192,34 @@ available for the different architectures are documented with the +### Config lifecycle at runtime and training {#config-lifecycle} + +A pipeline's `config.cfg` is considered the "single source of truth", both at +**training** and **runtime**. Under the hood, +[`Language.from_config`](/api/language#from_config) takes care of constructing +the `nlp` object using the settings defined in the config. An `nlp` object's +config is available as [`nlp.config`](/api/language#config) and it includes all +information about the pipeline, as well as the settings used to train and +initialize it. + +![Illustration of pipeline lifecycle](../images/lifecycle.svg) + +At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the +config and load all data, including tokenization rules, model weights and other +resources from the pipeline directory. The `[training]` block contains the +settings for training the model and is only used during training. Similarly, the +`[initialize]` block defines how the initial `nlp` object should be set up +before training and whether it should be initialized with vectors or pretrained +tok2vec weights, or any other data needed by the components. + +The initialization settings are only loaded and used when +[`nlp.initialize`](/api/language#initialize) is called (typically right before +training). This allows you to set up your pipeline using local data resources +and custom functions, and preserve the information in your config – but without +requiring it to be available at runtime. You can also use this mechanism to +provide data paths to custom pipeline components and custom tokenizers – see the +section on [custom initialization](#initialization) for details. + ### Overwriting config settings on the command line {#config-overrides} The config system means that you can define all settings **in one place** and in @@ -233,6 +263,61 @@ defined in the config file. 
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh ``` +### Using variable interpolation {#config-interpolation} + +Another very useful feature of the config system is that it supports variable +interpolation for both **values and sections**. This means that you only need to +define a setting once and can reference it across your config using the +`${section.value}` syntax. In this example, the value of `seed` is reused within +the `[training]` block, and the whole block of `[training.optimizer]` is reused +in `[pretraining]` and will become `pretraining.optimizer`. + +```ini +### config.cfg (excerpt) {highlight="5,18"} +[system] +seed = 0 + +[training] +seed = ${system.seed} + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 1e-8 + +[pretraining] +optimizer = ${training.optimizer} +``` + +You can also use variables inside strings. In that case, it works just like +f-strings in Python. If the value of a variable is not a string, it's converted +to a string. + +```ini +[paths] +version = 5 +root = "/Users/you/data" +train = "${paths.root}/train_${paths.version}.spacy" +# Result: /Users/you/data/train_5.spacy +``` + + + +If you need to change certain values between training runs, you can define them +once, reference them as variables and then [override](#config-overrides) them on +the CLI. For example, `--paths.root /other/root` will change the value of `root` +in the block `[paths]` and the change will be reflected across all other values +that reference this variable. + + + +## Customizing the pipeline and training {#config-custom} + ### Defining pipeline components {#config-components} You typically train a [pipeline](/usage/processing-pipelines) of **one or more @@ -353,59 +438,6 @@ stop = 1000 compound = 1.001 ``` -### Using variable interpolation {#config-interpolation} - -Another very useful feature of the config system is that it supports variable -interpolation for both **values and sections**. This means that you only need to -define a setting once and can reference it across your config using the -`${section.value}` syntax. In this example, the value of `seed` is reused within -the `[training]` block, and the whole block of `[training.optimizer]` is reused -in `[pretraining]` and will become `pretraining.optimizer`. - -```ini -### config.cfg (excerpt) {highlight="5,18"} -[system] -seed = 0 - -[training] -seed = ${system.seed} - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 1e-8 - -[pretraining] -optimizer = ${training.optimizer} -``` - -You can also use variables inside strings. In that case, it works just like -f-strings in Python. If the value of a variable is not a string, it's converted -to a string. - -```ini -[paths] -version = 5 -root = "/Users/you/data" -train = "${paths.root}/train_${paths.version}.spacy" -# Result: /Users/you/data/train_5.spacy -``` - - - -If you need to change certain values between training runs, you can define them -once, reference them as variables and then [override](#config-overrides) them on -the CLI. For example, `--paths.root /other/root` will change the value of `root` -in the block `[paths]` and the change will be reflected across all other values -that reference this variable. 
- - - ### Model architectures {#model-architectures} > #### 💡 Model type annotations @@ -506,17 +538,7 @@ still look good. - - -## Custom Functions {#custom-functions} +## Custom functions {#custom-functions} Registered functions in the training config files can refer to built-in implementations, but you can also plug in fully **custom implementations**. All @@ -763,7 +785,136 @@ start = 2 factor = 1.005 ``` -#### Example: Custom data reading and batching {#custom-code-readers-batchers} +### Defining custom architectures {#custom-architectures} + +Built-in pipeline components such as the tagger or named entity recognizer are +constructed with default neural network [models](/api/architectures). You can +change the model architecture entirely by implementing your own custom models +and providing those in the config when creating the pipeline component. See the +documentation on [layers and model architectures](/usage/layers-architectures) +for more details. + +> ```ini +> ### config.cfg +> [components.tagger] +> factory = "tagger" +> +> [components.tagger.model] +> @architectures = "custom_neural_network.v1" +> output_width = 512 +> ``` + +```python +### functions.py +from typing import List +from thinc.types import Floats2d +from thinc.api import Model +import spacy +from spacy.tokens import Doc + +@spacy.registry.architectures("custom_neural_network.v1") +def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: + return create_model(output_width) +``` + +### Customizing the initialization {#initialization} + + + +## Data utilities {#data} + +spaCy includes various features and utilities to make it easy to train models +using your own data, manage training and evaluation corpora, convert existing +annotations and configure data augmentation strategies for more robust models. + +### Converting existing corpora and annotations {#data-convert} + +If you have training data in a standard format like `.conll` or `.conllu`, the +easiest way to convert it for use with spaCy is to run +[`spacy convert`](/api/cli#convert) and pass it a file and an output directory. +By default, the command will pick the converter based on the file extension. + +```cli +$ python -m spacy convert ./train.gold.conll ./corpus +``` + +> #### 💡 Tip: Converting from Prodigy +> +> If you're using the [Prodigy](https://prodi.gy) annotation tool to create +> training data, you can run the +> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to +> merge and export multiple datasets for use with +> [`spacy train`](/api/cli#train). Different types of annotations on the same +> text will be combined, giving you one corpus to train multiple components. + + + +Training workflows often consist of multiple steps, from preprocessing the data +all the way to packaging and deploying the trained model. +[spaCy projects](/usage/projects) let you define all steps in one file, manage +data assets, track changes and share your end-to-end processes with your team. + + + +The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing +one or more [`Doc`](/api/doc) objects. It is extremely **efficient in +storage**, especially when packing multiple documents together. You can also +create `Doc` objects manually, so you can write your own custom logic to convert +and store existing annotations for use in spaCy.
+ +```python +### Training data from Doc objects {highlight="6-9"} +import spacy +from spacy.tokens import Doc, DocBin + +nlp = spacy.blank("en") +docbin = DocBin(nlp.vocab) +words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."] +spaces = [True, True, True, True, True, True, True, False] +ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"] +doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents) +docbin.add(doc) +docbin.to_disk("./train.spacy") +``` + +### Working with corpora {#data-corpora} + +> #### Example +> +> ```ini +> [corpora] +> +> [corpora.train] +> @readers = "spacy.Corpus.v1" +> path = ${paths.train} +> gold_preproc = false +> max_length = 0 +> limit = 0 +> augmenter = null +> +> [training] +> train_corpus = "corpora.train" +> ``` + +The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets +you define **data resources** to use for training, evaluation, pretraining or +any other custom workflows. `corpora.train` and `corpora.dev` are used as +conventions within spaCy's default configs, but you can also define any other +custom blocks. Each section in the corpora config should resolve to a +[`Corpus`](/api/corpus) – for example, using spaCy's built-in +[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy` +file. The `train_corpus` and `dev_corpus` fields in the +[`[training]`](/api/data-formats#config-training) block specify where to find +the corpus in your config. This makes it easy to **swap out** different corpora +by only changing a single config setting. + +Instead of making `[corpora]` a block with multiple subsections for each portion +of the data, you can also use a single function that returns a dictionary of +corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be +especially useful if you need to split a single file into corpora for training +and evaluation, without loading the same file twice. + +### Custom data reading and batching {#custom-code-readers-batchers} Some use-cases require **streaming in data** or manipulating datasets on the fly, rather than generating all data beforehand and storing it to file. Instead @@ -859,38 +1010,139 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp return create_filtered_batches ``` -### Defining custom architectures {#custom-architectures} + -Built-in pipeline components such as the tagger or named entity recognizer are -constructed with default neural network [models](/api/architectures). You can -change the model architecture entirely by implementing your own custom models -and providing those in the config when creating the pipeline component. See the -documentation on [layers and model architectures](/usage/layers-architectures) -for more details. +### Data augmentation {#data-augmentation} -> ```ini -> ### config.cfg -> [components.tagger] -> factory = "tagger" +Data augmentation is the process of applying small **modifications** to the +training data. It can be especially useful for punctuation and case replacement +– for example, if your corpus only uses smart quotes and you want to include +variations using regular quotes, or to make the model less sensitive to +capitalization by including a mix of capitalized and lowercase examples. + +The easiest way to use data augmentation during training is to provide an +`augmenter` to the training corpus, e.g. in the `[corpora.train]` section of +your config. 
The built-in [`orth_variants`](/api/top-level#orth_variants) +augmenter creates a data augmentation callback that uses orth-variant +replacement. + +```ini +### config.cfg (excerpt) {highlight="8,14"} +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +gold_preproc = false +max_length = 0 +limit = 0 + +[corpora.train.augmenter] +@augmenters = "spacy.orth_variants.v1" +# Percentage of texts that will be augmented / lowercased +level = 0.1 +lower = 0.5 + +[corpora.train.augmenter.orth_variants] +@readers = "srsly.read_json.v1" +path = "corpus/orth_variants.json" +``` + +The `orth_variants` argument lets you pass in a dictionary of replacement rules, +typically loaded from a JSON file. There are two types of orth variant rules: +`"single"` for single tokens that should be replaced (e.g. hyphens) and +`"paired"` for pairs of tokens (e.g. quotes). + + +```json +### orth_variants.json +{ + "single": [{ "tags": ["NFP"], "variants": ["…", "..."] }], + "paired": [{ "tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]] }] +} +``` + + + +```json +https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json +``` + +```json +https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/de_orth_variants.json +``` + + + + + +When adding data augmentation, keep in mind that it typically only makes sense +to apply it to the **training corpus**, not the development data. + + + +#### Writing custom data augmenters {#data-augmentation-custom} + +Using the [`@spacy.augmenters`](/api/top-level#registry) registry, you can also +register your own data augmentation callbacks. The callback should be a function +that takes the current `nlp` object and a training [`Example`](/api/example) and +yields `Example` objects. Keep in mind that the augmenter should yield **all +examples** you want to use in your corpus, not only the augmented examples +(unless you want to augment all examples). + +Here's an example of a custom augmentation callback that produces text variants +in ["SpOnGeBoB cAsE"](https://knowyourmeme.com/memes/mocking-spongebob). The +registered function takes one argument `randomize` that can be set via the +config and decides whether the uppercase/lowercase transformation is applied +randomly or not. The augmenter yields two `Example` objects: the original +example and the augmented example.
+ +> #### config.cfg > -> [components.tagger.model] -> @architectures = "custom_neural_network.v1" -> output_width = 512 +> ```ini +> [corpora.train.augmenter] +> @augmenters = "spongebob_augmenter.v1" +> randomize = false > ``` ```python -### functions.py -from typing import List -from thinc.types import Floats2d -from thinc.api import Model import spacy -from spacy.tokens import Doc +import random -@spacy.registry.architectures("custom_neural_network.v1") -def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: - return create_model(output_width) +@spacy.registry.augmenters("spongebob_augmenter.v1") +def create_augmenter(randomize: bool = False): + def augment(nlp, example): + text = example.text + if randomize: + # Randomly uppercase/lowercase characters + chars = [c.lower() if random.random() < 0.5 else c.upper() for c in text] + else: + # Uppercase followed by lowercase + chars = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)] + # Create augmented training example + example_dict = example.to_dict() + doc = nlp.make_doc("".join(chars)) + example_dict["token_annotation"]["ORTH"] = [t.text for t in doc] + # Original example followed by augmented example + yield example + yield example.from_dict(doc, example_dict) + + return augment ``` +An easy way to create modified `Example` objects is to use the +[`Example.from_dict`](/api/example#from_dict) method with a new reference +[`Doc`](/api/doc) created from the modified text. In this case, only the +capitalization changes, so only the `ORTH` values of the tokens will be +different between the original and augmented examples. + +Note that if your data augmentation strategy involves changing the tokenization +(for instance, removing or adding tokens) and your training examples include +token-based annotations like the dependency parse or entity labels, you'll need +to take care to adjust the `Example` object so its annotations match and remain +valid. + ## Parallel & distributed training with Ray {#parallel-training} > #### Installation @@ -1001,17 +1253,6 @@ a dictionary with keyword arguments specifying the annotations, like `tags` or annotations, the model can be updated to learn a sentence of three words with their assigned part-of-speech tags. -> #### About the tag map -> -> The tag map is part of the vocabulary and defines the annotation scheme. If -> you're training a new pipeline, this will let you map the tags present in the -> treebank you train on to spaCy's tag scheme: -> -> ```python -> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}} -> vocab = Vocab(tag_map=tag_map) -> ``` - ```python words = ["I", "like", "stuff"] tags = ["NOUN", "VERB", "NOUN"] @@ -1056,8 +1297,8 @@ of being dropped. > - [`nlp`](/api/language): The `nlp` object with the pipeline components and > their models. -> - [`nlp.begin_training`](/api/language#begin_training): Start the training and -> return an optimizer to update the component model weights. +> - [`nlp.initialize`](/api/language#initialize): Start the training and return +> an optimizer to update the component model weights. > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds > state between updates. > - [`nlp.update`](/api/language#update): Update component models with examples. @@ -1068,7 +1309,7 @@ of being dropped. 
```python ### Example training loop -optimizer = nlp.begin_training() +optimizer = nlp.initialize() for itn in range(100): random.shuffle(train_data) for raw_text, entity_offsets in train_data: diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 94c50e1ec..4ce57af01 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md' > > ```ini > [training] -> vectors = null > accumulate_gradient = 3 > > [training.optimizer] @@ -123,13 +122,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments and track changes. You can use the [quickstart widget](/usage/training#quickstart) or the `init config` command to get started. Instead of providing lots of arguments on the command line, you -only need to pass your `config.cfg` file to `spacy train`. - +only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Training config files include all **settings and hyperparameters** for training your pipeline. Some settings can also be registered **functions** that you can swap out and customize, making it easy to implement your own custom models and architectures. +![Illustration of pipeline lifecycle](../images/lifecycle.svg) + - **Usage:** [Training pipelines and models](/usage/training) @@ -429,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0. | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | +| [`init vectors`](/api/cli#init-vectors) | Convert word vectors for use with spaCy. | +| [`init labels`](/api/cli#init-labels) | Generate JSON files for the labels in the data to speed up training. | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | | [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. | @@ -526,10 +528,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`Pipe.update`](/api/pipe#update) methods now all take batches of [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or raw text and a dictionary of annotations. - [`Language.begin_training`](/api/language#begin_training) and - [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that - returns a sequence of `Example` objects to initialize the model instead of a - list of tuples. + [`Language.initialize`](/api/language#initialize) and + [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a + sequence of `Example` objects to initialize the model instead of a list of + tuples. +- The `begin_training` methods have been renamed to `initialize`. - [`Matcher.add`](/api/matcher#add) and [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of patterns as the second argument (instead of a variable number of arguments). 
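To make the new `Matcher.add` signature concrete, here is a minimal sketch of the v3-style call (the match ID and pattern are made up for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
# v2.x accepted a variable number of pattern arguments:
#     matcher.add("HELLO_WORLD", None, pattern)
# v3 expects a single list of patterns as the second argument:
matcher.add("HELLO_WORLD", [pattern])
doc = nlp("hello world")
print(matcher(doc))  # [(match_id, start, end)]
```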
@@ -555,13 +558,14 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | Removed | Replacement | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... | | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | | `GoldParse` | [`Example`](/api/example) | | `GoldCorpus` | [`Corpus`](/api/corpus) | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | | `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | -| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | +| `spacy init-model` | [`spacy init vectors`](/api/cli#init-vectors) | | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | | `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | @@ -721,7 +725,7 @@ nlp = spacy.blank("en") Because pipeline components are now added using their string names, you won't have to instantiate the [component classes](/api/#architecture-pipeline) -directly anynore. To configure the component, you can now use the `config` +directly anymore. To configure the component, you can now use the `config` argument on [`nlp.add_pipe`](/api/language#add_pipe). > #### config.cfg (excerpt) @@ -936,7 +940,7 @@ TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London.", {"entities": [(7, 13, "LOC")]}), ] -nlp.begin_training() +nlp.initialize() for i in range(20): random.shuffle(TRAIN_DATA) for batch in minibatch(TRAIN_DATA): @@ -946,17 +950,18 @@ for i in range(20): nlp.update(examples) ``` -[`Language.begin_training`](/api/language#begin_training) and -[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that -returns a sequence of `Example` objects to initialize the model instead of a -list of tuples. The data examples are used to **initialize the models** of +`Language.begin_training` and `Pipe.begin_training` have been renamed to +[`Language.initialize`](/api/language#initialize) and +[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function +that returns a sequence of `Example` objects to initialize the model instead of +a list of tuples. The data examples are used to **initialize the models** of trainable pipeline components, which includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme. 
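As a rough sketch of what the new `initialize` call looks like in practice (toy annotations only, assuming a blank English pipeline with an `ner` component added):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("ner")

# Toy gold-standard data, mirroring the migration example above
train_data = [("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]})]
examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
]
# v3: pass a function that returns the examples, not the examples themselves
optimizer = nlp.initialize(lambda: examples)
```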
```diff -- nlp.begin_training(examples) -+ nlp.begin_training(lambda: examples) +- nlp.initialize(examples) ++ nlp.initialize(lambda: examples) ``` #### Packaging trained pipelines {#migrating-training-packaging} diff --git a/website/gatsby-config.js b/website/gatsby-config.js index c1a2f9ab9..4650711ac 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -1,6 +1,11 @@ const autoprefixer = require('autoprefixer') const path = require('path') +// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/ +const sharp = require('sharp') +sharp.cache(false) +sharp.simd(false) + // Markdown plugins const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js') const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js') diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json index 79d4d357d..acbc88ae2 100644 --- a/website/meta/type-annotations.json +++ b/website/meta/type-annotations.json @@ -23,6 +23,8 @@ "PhraseMatcher": "/api/phrasematcher", "TransformerData": "/api/transformer#transformerdata", "FullTransformerBatch": "/api/transformer#fulltransformerbatch", + "Corpus": "/api/corpus", + "JsonlCorpus": "/api/corpus#jsonlcorpus", "LexemeC": "/api/cython-structs#lexemec", "TokenC": "/api/cython-structs#tokenc", "Config": "https://thinc.ai/docs/api-config#config",