diff --git a/Makefile b/Makefile index c4e77d101..46a7b22ba 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ SHELL := /bin/bash ifndef SPACY_EXTRAS -override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core +override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core endif ifndef PYVER diff --git a/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py index 66d96ff68..41b6a70da 100644 --- a/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py +++ b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py @@ -1,7 +1,7 @@ from pathlib import Path import plac import spacy -from spacy.gold import docs_to_json +from spacy.training import docs_to_json import srsly import sys diff --git a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index a5fa32b18..e2ab148c6 100644 --- a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -31,10 +31,13 @@ lang = "en" vectors = null [nlp.pipeline.ner] -factory = "simple_ner" +factory = "ner" [nlp.pipeline.ner.model] -@architectures = "spacy.BiluoTagger.v1" +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 6 +hidden_width = 64 +maxout_pieces = 2 [nlp.pipeline.ner.model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/pyproject.toml b/pyproject.toml index d23730b00..e610e603e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a30,<8.0.0a40", + "thinc>=8.0.0a31,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 9b108de8d..db6eae2ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a30,<8.0.0a40 +thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index fc33abedb..10a8972b0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a30,<8.0.0a40 + thinc>=8.0.0a31,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a30,<8.0.0a40 + thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 @@ -64,7 +64,7 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=0.3.2,<0.4.0 + spacy_lookups_data==0.4.0.dev0 cuda = cupy>=5.0.0b4,<9.0.0 cuda80 = diff --git a/setup.py b/setup.py index d448a262c..4a4b99f22 100755 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ - "spacy.gold.example", + "spacy.training.example", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", @@ -48,7 +48,7 @@ MOD_NAMES = [ "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", - "spacy.gold.gold_io", + "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", diff --git a/spacy/about.py b/spacy/about.py index 7d0e85a17..b8dc65455 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,7 +1,8 @@ # fmt: off __title__ = 
"spacy-nightly" -__version__ = "3.0.0a14" +__version__ = "3.0.0a16" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" -__projects__ = "https://github.com/explosion/spacy-boilerplates" +__projects__ = "https://github.com/explosion/projects" +__projects_branch__ = "v3" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0ecb5ad8f..360d2439a 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING import sys import shutil from pathlib import Path @@ -6,6 +6,7 @@ from wasabi import msg import srsly import hashlib import typer +import subprocess from click import NoSuchOption from typer.main import get_command from contextlib import contextmanager @@ -13,7 +14,7 @@ from thinc.config import Config, ConfigValidationError from configparser import InterpolationError from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir +from ..util import import_file, run_command, make_tempdir, registry if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -54,6 +55,8 @@ app.add_typer(init_cli) def setup_cli() -> None: + # Make sure the entry-point for CLI runs, so that they get imported. + registry.cli.get_all() # Ensure that the help messages always display the correct prompt command = get_command(app) command(prog_name=COMMAND) @@ -318,33 +321,87 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: + git_version = get_git_version() + supports_sparse = git_version >= (2, 22) # This is the "clone, but don't download anything" part. - cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " - f"--filter=blob:none " # <-- The key bit - f"-b {branch}" - ) - run_command(cmd, capture=True) + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " + if supports_sparse: + cmd += f"--filter=blob:none" # <-- The key bit + else: + msg.warn( + f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " + f"that doesn't fully support sparse checkout yet. This means that " + f"more files than necessary may be downloaded temporarily. To " + f"only download the files needed, upgrade to Git v2.22 or above." + ) + _attempt_run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" - ret = run_command(cmd, capture=True) - repo = _from_http_to_git(repo) + cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}" + ret = _attempt_run_command(cmd) + git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}" - run_command(cmd, capture=True) + if supports_sparse and not missings: + err = ( + f"Could not find any relevant files for '{subpath}'. " + f"Did you specify a correct and complete path within repo '{repo}' " + f"and branch {branch}?" 
+ ) + msg.fail(err, exits=1) + if supports_sparse: + cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" + _attempt_run_command(cmd) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - run_command(cmd) + _attempt_run_command(cmd) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) -def _from_http_to_git(repo): +def get_git_version() -> Tuple[int, int]: + ret = _attempt_run_command(["git", "--version"]) + # TODO: this seems kinda brittle? + version = ret.stdout[11:].strip().split(".") + return (int(version[0]), int(version[1])) + + +def _attempt_run_command(cmd: Union[str, List[str]]): + try: + return run_command(cmd, capture=True) + except subprocess.CalledProcessError as e: + err = f"Could not run command" + msg.fail(err) + print(cmd) + sys.exit(1) + + +def _from_http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): repo = repo.replace("https://", "git@").replace("/", ":", 1) + if repo.endswith("/"): + repo = repo[:-1] repo = f"{repo}.git" return repo + + +def string_to_list(value, intify=False): + """Parse a comma-separated string to a list""" + if not value: + return [] + if value.startswith("[") and value.endswith("]"): + value = value[1:-1] + result = [] + for p in value.split(","): + p = p.strip() + if p.startswith("'") and p.endswith("'"): + p = p[1:-1] + if p.startswith('"') and p.endswith('"'): + p = p[1:-1] + p = p.strip() + if intify: + p = int(p) + result.append(p) + return result diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ade5a3ad4..ad89b9976 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,9 +7,9 @@ import re import sys from ._util import app, Arg, Opt -from ..gold import docs_to_json +from ..training import docs_to_json from ..tokens import DocBin -from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs +from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs # Converters are matched by file extension except for ner/iob, which are diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 75a81e6f5..d52f30b82 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -8,7 +8,7 @@ import typer from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli, get_sourced_components -from ..gold import Corpus, Example +from ..training import Corpus, Example from ..pipeline._parser_internals import nonproj from ..language import Language from .. import util diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 5bd4e008f..1a250e43e 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -5,7 +5,7 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation import typer -from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides +from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list from .. 
import util @@ -38,12 +38,13 @@ def debug_model_cli( require_gpu(use_gpu) else: msg.info("Using CPU") + layers = string_to_list(layers, intify=True) print_settings = { "dimensions": dimensions, "parameters": parameters, "gradients": gradients, "attributes": attributes, - "layers": [int(x.strip()) for x in layers.split(",")] if layers else [], + "layers": layers, "print_before_training": P0, "print_after_init": P1, "print_after_training": P2, @@ -84,11 +85,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None _print_model(model, print_settings) # STEP 1: Initializing the model and printing again + X = _get_docs() Y = _get_output(model.ops.xp) - _set_output_dim(nO=Y.shape[-1], model=model) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=_get_docs(), Y=Y) + model.initialize(X=X, Y=Y) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -135,15 +136,6 @@ def _get_output(xp): return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") -def _set_output_dim(model, nO): - # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx - if model.has_dim("nO") is None: - model.set_dim("nO", nO) - if model.has_ref("output_layer"): - if model.get_ref("output_layer").has_dim("nO") is None: - model.get_ref("output_layer").set_dim("nO", nO) - - def _print_model(model, print_settings): layers = print_settings.get("layers", "") parameters = print_settings.get("parameters", False) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index c5cbab09a..f9954d9ad 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -5,7 +5,7 @@ import re import srsly from thinc.api import require_gpu, fix_random_seed -from ..gold import Corpus +from ..training import Corpus from ..tokens import Doc from ._util import app, Arg, Opt from ..scorer import Scorer diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 584ca7f64..ec65b0e0a 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -9,7 +9,7 @@ import re from .. import util from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema -from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND +from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list ROOT = Path(__file__).parent / "templates" @@ -42,7 +42,7 @@ def init_config_cli( """ if isinstance(optimize, Optimizations): # instance of enum from the CLI optimize = optimize.value - pipeline = [p.strip() for p in pipeline.split(",")] + pipeline = string_to_list(pipeline) init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 5f06fd895..9eab7b54d 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -256,6 +256,7 @@ def add_vectors( def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): f = open_file(vectors_loc) + f = ensure_shape(f) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: shape = (truncate_vectors, shape[1]) @@ -274,6 +275,31 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): return vectors_data, vectors_keys +def ensure_shape(lines): + """Ensure that the first line of the data is the vectors shape. 
+ + If it's not, we read in the data and output the shape as the first result, + so that the reader doesn't have to deal with the problem. + """ + first_line = next(lines) + try: + shape = tuple(int(size) for size in first_line.split()) + except ValueError: + shape = None + if shape is not None: + # All good, give the data + yield first_line + yield from lines + else: + # Figure out the shape, make it the first value, and then give the + # rest of the data. + width = len(first_line.split()) - 1 + captured = [first_line] + list(lines) + length = len(captured) + yield f"{length} {width}" + yield from captured + + def read_freqs( freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50 ): diff --git a/spacy/cli/package.py b/spacy/cli/package.py index c457b3e17..8d6cd84c1 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,6 +18,7 @@ def package_cli( output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), @@ -38,6 +39,7 @@ def package_cli( input_dir, output_dir, meta_path=meta_path, + name=name, version=version, create_meta=create_meta, create_sdist=not no_sdist, @@ -50,6 +52,7 @@ def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, + name: Optional[str] = None, version: Optional[str] = None, create_meta: bool = False, create_sdist: bool = True, @@ -71,6 +74,8 @@ def package( msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) + if name is not None: + meta["name"] = name if version is not None: meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 2b623675d..7326b2e5c 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -38,16 +38,21 @@ def project_assets(project_dir: Path) -> None: msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") for asset in assets: - dest = Path(asset["dest"]) + dest = (project_dir / asset["dest"]).resolve() checksum = asset.get("checksum") if "git" in asset: if dest.exists(): # If there's already a file, check for checksum if checksum and checksum == get_checksum(dest): - msg.good(f"Skipping download with matching checksum: {dest}") + msg.good( + f"Skipping download with matching checksum: {asset['dest']}" + ) continue else: - shutil.rmtree(dest) + if dest.is_dir(): + shutil.rmtree(dest) + else: + dest.unlink() git_sparse_checkout( asset["git"]["repo"], asset["git"]["path"], @@ -67,14 +72,16 @@ def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: """Check and validate assets without a URL (private assets that the user has to provide themselves) and give feedback about the checksum. - dest (Path): Desintation path of the asset. 
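A quick sketch of how the ensure_shape generator added to spacy/cli/init_model.py above behaves when a vectors file has no leading "rows dims" header line; the sample vectors and values are illustrative only:

    from spacy.cli.init_model import ensure_shape

    raw_lines = iter(["king 0.1 0.2 0.3", "queen 0.2 0.1 0.4"])  # no shape header
    fixed = ensure_shape(raw_lines)
    print(next(fixed))  # "2 3" -- a shape line is synthesized and yielded first
    print(next(fixed))  # "king 0.1 0.2 0.3" -- the original data then follows unchanged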
+ dest (Path): Destination path of the asset. checksum (Optional[str]): Optional checksum of the expected file. """ if not Path(dest).exists(): err = f"No URL provided for asset. You need to add this file yourself: {dest}" msg.warn(err) else: - if checksum and checksum == get_checksum(dest): + if not checksum: + msg.good(f"Asset already exists: {dest}") + elif checksum == get_checksum(dest): msg.good(f"Asset exists with matching checksum: {dest}") else: msg.fail(f"Asset available but with incorrect checksum: {dest}") diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index a419feb0f..ab617e4ba 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -16,6 +16,7 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to clone"), dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"), + branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from") # fmt: on ): """Clone a project template from a repository. Calls into "git" and will @@ -26,23 +27,30 @@ def project_clone_cli( DOCS: https://nightly.spacy.io/api/cli#project-clone """ if dest is None: - dest = Path.cwd() / name - project_clone(name, dest, repo=repo) + dest = Path.cwd() / Path(name).parts[-1] + project_clone(name, dest, repo=repo, branch=branch) -def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None: +def project_clone( + name: str, + dest: Path, + *, + repo: str = about.__projects__, + branch: str = about.__projects_branch__, +) -> None: """Clone a project template from a repository. name (str): Name of subdirectory to clone. dest (Path): Destination path of cloned project. repo (str): URL of Git repo containing project templates. + branch (str): The branch to clone from """ dest = ensure_path(dest) check_clone(name, dest, repo) project_dir = dest.resolve() repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) try: - git_sparse_checkout(repo, name, dest) + git_sparse_checkout(repo, name, dest, branch=branch) except subprocess.CalledProcessError: err = f"Could not clone '{name}' from repo '{repo_name}'" msg.fail(err, exits=1) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6be47fa39..0bc493e56 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,5 @@ from typing import Optional, Dict, Any, Tuple, Union, Callable, List +from timeit import default_timer as timer import srsly import tqdm from pathlib import Path @@ -15,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, get_sourced_components from ..language import Language from .. 
import util -from ..gold.example import Example +from ..training.example import Example from ..errors import Errors @@ -286,9 +287,12 @@ def train_while_improving( ] raw_batches = util.minibatch(raw_examples, size=8) + words_seen = 0 + start_time = timer() for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude ) @@ -317,6 +321,7 @@ def train_while_improving( else: score, other_scores = (None, None) is_best_checkpoint = None + words_seen += sum(len(eg) for eg in batch) info = { "epoch": epoch, "step": step, @@ -324,6 +329,8 @@ def train_while_improving( "other_scores": other_scores, "losses": losses, "checkpoints": results, + "seconds": int(timer() - start_time), + "words": words_seen, } yield batch, info, is_best_checkpoint if is_best_checkpoint is not None: diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 9507f0f0a..7cd71453f 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -52,7 +52,7 @@ path = ${paths.train} # data is passed in sentence-by-sentence via some prior preprocessing. gold_preproc = false # Limitations on training document length -max_length = 2000 +max_length = 0 # Limitation on number of training examples limit = 0 @@ -64,7 +64,7 @@ path = ${paths.dev} # data is passed in sentence-by-sentence via some prior preprocessing. gold_preproc = false # Limitations on training document length -max_length = 2000 +max_length = 0 # Limitation on number of training examples limit = 0 @@ -88,9 +88,4 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 1e-8 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.001 +learn_rate = 0.001 diff --git a/spacy/errors.py b/spacy/errors.py index bad3e83e4..7164598b6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -66,7 +66,7 @@ class Warnings: "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " - "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") W033 = ("Training a new {model} using a model with no lexeme normalization " @@ -247,8 +247,8 @@ class Errors: "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") E065 = ("Only one of the vector table's width and shape can be specified. " "Got width {width} and shape {shape}.") - E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " - "an entity) without a preceding 'B' (beginning of an entity). " + E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} " + "without a preceding 'B' (beginning of an entity). " "Tag sequence:\n{tags}") E068 = ("Invalid BILUO tag: '{tag}'.") E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " @@ -320,10 +320,6 @@ class Errors: "So instead of pickling the span, pickle the Doc it belongs to or " "use Span.as_doc to convert the span to a standalone Doc object.") E115 = ("All subtokens must have associated heads.") - E116 = ("Cannot currently add labels to pretrained text classifier. Add " - "labels before training begins. 
This functionality was available " - "in previous versions, but had significant bugs that led to poor " - "performance.") E117 = ("The newly split tokens must match the text of the original token. " "New orths: {new}. Old text: {old}.") E118 = ("The custom extension attribute '{attr}' is not registered on the " @@ -378,8 +374,9 @@ class Errors: "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E143 = ("Labels for component '{name}' not initialized. Did you forget to " - "call add_label()?") + E143 = ("Labels for component '{name}' not initialized. This can be fixed " + "by calling add_label, or by providing a representative batch of " + "examples to the component's begin_training method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -483,6 +480,16 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E921 = ("The method 'set_output' can only be called on components that have " + "a Model with a 'resize_output' attribute. Otherwise, the output " + "layer can not be dynamically changed.") + E922 = ("Component '{name}' has been initialized with an output dimension of " + "{nO} - cannot add any more labels.") + E923 = ("It looks like there is no proper sample data to initialize the " + "Model of component '{name}'. " + "This is likely a bug in spaCy, so feel free to open an issue.") + E924 = ("The '{name}' component does not seem to be initialized properly. " + "This is likely a bug in spaCy, so feel free to open an issue.") E925 = ("Invalid color values for displaCy visualizer: expected dictionary " "mapping label names to colors but got: {obj}") E926 = ("It looks like you're trying to modify nlp.{attr} directly. This " diff --git a/spacy/language.py b/spacy/language.py index cd84e30a4..70dad59f3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,7 +17,7 @@ from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis -from .gold import Example, validate_examples +from .training import Example, validate_examples from .scorer import Scorer from .util import create_default_optimizer, registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER @@ -243,7 +243,8 @@ class Language: self._config["nlp"]["pipeline"] = list(self.component_names) self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline - self._config["training"]["score_weights"] = combine_score_weights(score_weights) + if not self._config["training"].get("score_weights"): + self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -656,7 +657,7 @@ class Language: return resolved[factory_name] def create_pipe_from_source( - self, source_name: str, source: "Language", *, name: str, + self, source_name: str, source: "Language", *, name: str ) -> Tuple[Callable[[Doc], Doc], str]: """Create a pipeline component by copying it from an existing model. 
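In practice this code path is reached through nlp.add_pipe with a source pipeline; a minimal sketch, assuming a v3-compatible trained pipeline such as en_core_web_sm is installed (the package name is only an example):

    import spacy

    source_nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    # Copy the trained "ner" component and its config from the source pipeline.
    nlp.add_pipe("ner", source=source_nlp)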
@@ -1155,21 +1156,24 @@ class Language: DOCS: https://nightly.spacy.io/api/language#begin_training """ - # TODO: throw warning when get_gold_tuples is provided instead of get_examples if get_examples is None: - get_examples = lambda: [] - else: # Populate vocab - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Language", obj=type(get_examples)) + util.logger.debug( + "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" + ) + doc = Doc(self.vocab, words=["x", "y", "z"]) + get_examples = lambda: [Example.from_dict(doc, {})] + # Populate vocab + if not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name="Language", obj=type(get_examples)) + raise ValueError(err) + for example in get_examples(): + if not isinstance(example, Example): + err = Errors.E978.format( + name="Language.begin_training", types=type(example) + ) raise ValueError(err) - for example in get_examples(): - if not isinstance(example, Example): - err = Errors.E978.format( - name="Language.begin_training", types=type(example) - ) - raise ValueError(err) - for word in [t.text for t in example.reference]: - _ = self.vocab[word] # noqa: F841 + for word in [t.text for t in example.reference]: + _ = self.vocab[word] # noqa: F841 if device >= 0: # TODO: do we need this here? require_gpu(device) if self.vocab.vectors.data.shape[1] >= 1: @@ -1187,7 +1191,7 @@ class Language: return self._optimizer def resume_training( - self, *, sgd: Optional[Optimizer] = None, device: int = -1, + self, *, sgd: Optional[Optimizer] = None, device: int = -1 ) -> Optimizer: """Continue training a pretrained model. diff --git a/spacy/ml/_biluo.py b/spacy/ml/_biluo.py deleted file mode 100644 index 5a66a35bd..000000000 --- a/spacy/ml/_biluo.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import Dict, Optional -import numpy -from thinc.api import Model -from thinc.types import Padded, Floats3d - - -def BILUO() -> Model[Padded, Padded]: - return Model( - "biluo", - forward, - init=init, - dims={"nO": None}, - attrs={"get_num_actions": get_num_actions}, - ) - - -def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): - if X is not None and Y is not None: - if X.data.shape != Y.data.shape: - # TODO: Fix error - raise ValueError("Mismatched shapes (TODO: Fix message)") - model.set_dim("nO", X.data.shape[2]) - elif X is not None: - model.set_dim("nO", X.data.shape[2]) - elif Y is not None: - model.set_dim("nO", Y.data.shape[2]) - elif model.get_dim("nO") is None: - raise ValueError("Dimension unset for BILUO: nO") - - -def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): - n_labels = (model.get_dim("nO") - 1) // 4 - n_tokens, n_docs, n_actions = Xp.data.shape - # At each timestep, we make a validity mask of shape (n_docs, n_actions) - # to indicate which actions are valid next for each sequence. To construct - # the mask, we have a state of shape (2, n_actions) and a validity table of - # shape (2, n_actions+1, n_actions). The first dimension of the state indicates - # whether it's the last token, the second dimension indicates the previous - # action, plus a special 'null action' for the first entry. 
- valid_transitions = model.ops.asarray(_get_transition_table(n_labels)) - prev_actions = model.ops.alloc1i(n_docs) - # Initialize as though prev action was O - prev_actions.fill(n_actions - 1) - Y = model.ops.alloc3f(*Xp.data.shape) - masks = model.ops.alloc3f(*Y.shape) - max_value = Xp.data.max() - for t in range(Xp.data.shape[0]): - is_last = (Xp.lengths < (t + 2)).astype("i") - masks[t] = valid_transitions[is_last, prev_actions] - # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t] :] = 0 - # Valid actions get 0*10e8, invalid get large negative value - Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) - prev_actions = Y[t].argmax(axis=-1) - - def backprop_biluo(dY: Padded) -> Padded: - dY.data *= masks - return dY - - return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo - - -def get_num_actions(n_labels: int) -> int: - # One BEGIN action per label - # One IN action per label - # One LAST action per label - # One UNIT action per label - # One OUT action - return n_labels + n_labels + n_labels + n_labels + 1 - - -def _get_transition_table( - n_labels: int, *, _cache: Dict[int, Floats3d] = {} -) -> Floats3d: - n_actions = get_num_actions(n_labels) - if n_actions in _cache: - return _cache[n_actions] - table = numpy.zeros((2, n_actions, n_actions), dtype="f") - B_start, B_end = (0, n_labels) - I_start, I_end = (B_end, B_end + n_labels) - L_start, L_end = (I_end, I_end + n_labels) - U_start, _ = (L_end, L_end + n_labels) # noqa: F841 - # Using ranges allows us to set specific cells, which is necessary to express - # that only actions of the same label are valid continuations. - B_range = numpy.arange(B_start, B_end) - I_range = numpy.arange(I_start, I_end) - L_range = numpy.arange(L_start, L_end) - # If this is the last token and the previous action was B or I, only L - # of that label is valid - table[1, B_range, L_range] = 1 - table[1, I_range, L_range] = 1 - # If this isn't the last token and the previous action was B or I, only I or - # L of that label are valid. - table[0, B_range, I_range] = 1 - table[0, B_range, L_range] = 1 - table[0, I_range, I_range] = 1 - table[0, I_range, L_range] = 1 - # If this isn't the last token and the previous was L, U or O, B is valid - table[0, L_start:, :B_end] = 1 - # Regardless of whether this is the last token, if the previous action was - # {L, U, O}, U and O are valid. 
- table[:, L_start:, U_start:] = 1 - _cache[n_actions] = table - return table diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py deleted file mode 100644 index 4dbc79f52..000000000 --- a/spacy/ml/_iob.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Thinc layer to do simpler transition-based parsing, NER, etc.""" -from typing import Dict, Optional -from thinc.api import Ops, Model -from thinc.types import Padded, Floats3d - - -def IOB() -> Model[Padded, Padded]: - return Model( - "biluo", - forward, - init=init, - dims={"nO": None}, - attrs={"get_num_actions": get_num_actions}, - ) - - -def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None: - if X is not None and Y is not None: - if X.data.shape != Y.data.shape: - # TODO: Fix error - raise ValueError("Mismatched shapes (TODO: Fix message)") - model.set_dim("nO", X.data.shape[2]) - elif X is not None: - model.set_dim("nO", X.data.shape[2]) - elif Y is not None: - model.set_dim("nO", Y.data.shape[2]) - elif model.get_dim("nO") is None: - raise ValueError("Dimension unset for BILUO: nO") - - -def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): - n_labels = (model.get_dim("nO") - 1) // 2 - n_tokens, n_docs, n_actions = Xp.data.shape - # At each timestep, we make a validity mask of shape (n_docs, n_actions) - # to indicate which actions are valid next for each sequence. To construct - # the mask, we have a state of shape (2, n_actions) and a validity table of - # shape (2, n_actions+1, n_actions). The first dimension of the state indicates - # whether it's the last token, the second dimension indicates the previous - # action, plus a special 'null action' for the first entry. - valid_transitions = _get_transition_table(model.ops, n_labels) - prev_actions = model.ops.alloc1i(n_docs) - # Initialize as though prev action was O - prev_actions.fill(n_actions - 1) - Y = model.ops.alloc3f(*Xp.data.shape) - masks = model.ops.alloc3f(*Y.shape) - for t in range(Xp.data.shape[0]): - masks[t] = valid_transitions[prev_actions] - # Don't train the out-of-bounds sequences. - masks[t, Xp.size_at_t[t] :] = 0 - # Valid actions get 0*10e8, invalid get -1*10e8 - Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) - prev_actions = Y[t].argmax(axis=-1) - - def backprop_biluo(dY: Padded) -> Padded: - # Masking the gradient seems to do poorly here. But why? 
- # dY.data *= masks - return dY - - return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo - - -def get_num_actions(n_labels: int) -> int: - # One BEGIN action per label - # One IN action per label - # One LAST action per label - # One UNIT action per label - # One OUT action - return n_labels * 2 + 1 - - -def _get_transition_table( - ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {} -) -> Floats3d: - n_actions = get_num_actions(n_labels) - if n_actions in _cache: - return ops.asarray(_cache[n_actions]) - table = ops.alloc2f(n_actions, n_actions) - B_start, B_end = (0, n_labels) - I_start, I_end = (B_end, B_end + n_labels) - O_action = I_end - B_range = ops.xp.arange(B_start, B_end) - I_range = ops.xp.arange(I_start, I_end) - # B and O are always valid - table[:, B_start:B_end] = 1 - table[:, O_action] = 1 - # I can only follow a matching B - table[B_range, I_range] = 1 - - _cache[n_actions] = table - return table diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index dd58dab00..67e70421f 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -1,6 +1,5 @@ from .entity_linker import * # noqa from .parser import * # noqa -from .simple_ner import * # noqa from .tagger import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py deleted file mode 100644 index aca58c937..000000000 --- a/spacy/ml/models/simple_ner.py +++ /dev/null @@ -1,104 +0,0 @@ -from typing import List -from thinc.api import Model, Linear, with_array, softmax_activation, padded2list -from thinc.api import chain, list2padded, configure_normal_init -from thinc.api import Dropout -from thinc.types import Floats2d - -from ...tokens import Doc -from .._biluo import BILUO -from .._iob import IOB -from ...util import registry - - -@registry.architectures.register("spacy.BILUOTagger.v1") -def BiluoTagger( - tok2vec: Model[List[Doc], List[Floats2d]] -) -> Model[List[Doc], List[Floats2d]]: - """Construct a simple NER tagger, that predicts BILUO tag scores for each - token and uses greedy decoding with transition-constraints to return a valid - BILUO tag sequence. - - A BILUO tag sequence encodes a sequence of non-overlapping labelled spans - into tags assigned to each token. The first token of a span is given the - tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens - within the span are given the tag U-LABEL. Single-token spans are given - the tag U-LABEL. All other tokens are assigned the tag O. - - The BILUO tag scheme generally results in better linear separation between - classes, especially for non-CRF models, because there are more distinct classes - for the different situations (Ratinov et al., 2009). 
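To make the BILUO scheme described above concrete, a short sketch using the biluo_tags_from_offsets helper referenced in W030 earlier in this patch (the sentence and character offsets are illustrative):

    import spacy
    from spacy.training import biluo_tags_from_offsets

    nlp = spacy.blank("en")
    doc = nlp.make_doc("I like New York City")
    # A single three-token entity covering characters 7-20.
    print(biluo_tags_from_offsets(doc, [(7, 20, "GPE")]))
    # ['O', 'O', 'B-GPE', 'I-GPE', 'L-GPE']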
- """ - biluo = BILUO() - linear = Linear( - nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) - ) - model = chain( - tok2vec, - list2padded(), - with_array(chain(Dropout(0.1), linear)), - biluo, - with_array(softmax_activation()), - padded2list(), - ) - return Model( - "biluo-tagger", - forward, - init=init, - layers=[model, linear], - refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, - dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, - ) - - -@registry.architectures.register("spacy.IOBTagger.v1") -def IOBTagger( - tok2vec: Model[List[Doc], List[Floats2d]] -) -> Model[List[Doc], List[Floats2d]]: - """Construct a simple NER tagger, that predicts IOB tag scores for each - token and uses greedy decoding with transition-constraints to return a valid - IOB tag sequence. - - An IOB tag sequence encodes a sequence of non-overlapping labelled spans - into tags assigned to each token. The first token of a span is given the - tag B-LABEL, and subsequent tokens are given the tag I-LABEL. - All other tokens are assigned the tag O. - """ - biluo = IOB() - linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) - model = chain( - tok2vec, - list2padded(), - with_array(linear), - biluo, - with_array(softmax_activation()), - padded2list(), - ) - return Model( - "iob-tagger", - forward, - init=init, - layers=[model], - refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, - dims={"nO": None}, - attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, - ) - - -def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: - if model.get_dim("nO") is None and Y: - model.set_dim("nO", Y[0].shape[1]) - nO = model.get_dim("nO") - biluo = model.get_ref("biluo") - linear = model.get_ref("linear") - biluo.set_dim("nO", nO) - if linear.has_dim("nO") is None: - linear.set_dim("nO", nO) - model.layers[0].initialize(X=X, Y=Y) - - -def forward(model: Model, X: List[Doc], is_train: bool): - return model.layers[0](X, is_train) - - -__all__ = ["BiluoTagger"] diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index faa5350d4..2e5f8a802 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -165,7 +165,7 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(width: int, rows: int, nM: int, nC: int): - """Construct an embedded representations based on character embeddings, using + """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is used in the centre for words that are too short. @@ -176,8 +176,8 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): ensures that the final character is always in the last position, instead of being in an arbitrary position depending on the word length. - The characters are embedded in a embedding table with 256 rows, and the - vectors concatenated. A hash-embedded vector of the NORM of the word is + The characters are embedded in a embedding table with a given number of rows, + and the vectors concatenated. A hash-embedded vector of the NORM of the word is also concatenated on, and the result is then passed through a feed-forward network to construct a single vector to represent the information. 
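A hedged construction sketch for the CharacterEmbed architecture documented above; the hyperparameter values are illustrative, not necessarily spaCy's defaults:

    from spacy.ml.models.tok2vec import CharacterEmbed

    # width: output width, rows: rows of the NORM hash-embedding table,
    # nM: per-character embedding width, nC: characters taken per word.
    embed = CharacterEmbed(width=128, rows=7000, nM=64, nC=8)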
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 793aa83c3..656182088 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -8,7 +8,6 @@ from .morphologizer import Morphologizer from .pipe import Pipe from .senter import SentenceRecognizer from .sentencizer import Sentencizer -from .simple_ner import SimpleNER from .tagger import Tagger from .textcat import TextCategorizer from .tok2vec import Tok2Vec @@ -25,7 +24,6 @@ __all__ = [ "Pipe", "SentenceRecognizer", "Sentencizer", - "SimpleNER", "Tagger", "TextCategorizer", "Tok2Vec", diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 7db8aae0f..bb0bf35b8 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t from ...strings cimport hash_string from ...structs cimport TokenC from ...tokens.doc cimport Doc, set_children_from_heads -from ...gold.example cimport Example +from ...training.example cimport Example from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index 2570ccdee..0351bcaf7 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -5,7 +5,7 @@ from cymem.cymem cimport Pool from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE -from ...gold.example cimport Example +from ...training.example cimport Example from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index ba4c33814..458f1d5f9 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from ...typedefs cimport attr_t, weight_t from ...structs cimport TokenC from ...strings cimport StringStore -from ...gold.example cimport Example +from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 406112681..f64fcbc54 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -4,7 +4,7 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..gold import validate_examples +from ..training import validate_examples from ..language import Language from ..matcher import Matcher from ..scorer import Scorer diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index eee4ed535..edd791e40 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -9,7 +9,7 @@ from .functions import merge_subtokens from ..language import Language from ._parser_internals import nonproj from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index d4f1e6b56..1debadd82 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,3 +1,4 @@ +from itertools import islice from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple from pathlib import Path 
import srsly @@ -11,7 +12,7 @@ from ..tokens import Doc from .pipe import Pipe, deserialize_config from ..language import Language from ..vocab import Vocab -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..errors import Errors, Warnings from ..util import SimpleFrozenList from .. import util @@ -128,7 +129,7 @@ class EntityLinker(Pipe): # how many neightbour sentences to take into account self.n_sents = cfg.get("n_sents", 0) - def require_kb(self) -> None: + def _require_kb(self) -> None: # Raise an error if the knowledge base is not initialized. if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) @@ -140,10 +141,11 @@ class EntityLinker(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ) -> Optimizer: - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -153,10 +155,19 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#begin_training """ - self.require_kb() + self._ensure_examples(get_examples) + self._require_kb() nO = self.kb.entity_vector_length - self.set_output(nO) - self.model.initialize() + doc_sample = [] + vector_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + vector_sample.append(self.model.ops.alloc1f(nO)) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize( + X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") + ) if sgd is None: sgd = self.create_optimizer() return sgd @@ -184,7 +195,7 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#update """ - self.require_kb() + self._require_kb() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -296,7 +307,7 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#predict """ - self.require_kb() + self._require_kb() entity_count = 0 final_kb_ids = [] if not docs: @@ -405,7 +416,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: """Serialize the pipe to disk. @@ -422,7 +433,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. 
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4f4ff230e..24bbb067f 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples DEFAULT_ENT_ID_SEP = "||" diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 3f3e387b7..0fd3482c4 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab -from ..gold import validate_examples +from ..training import validate_examples from .. import util diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index bcb555b90..57bdb28d7 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -2,6 +2,7 @@ from typing import Optional import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config +from itertools import islice from ..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -15,7 +16,7 @@ from .pipe import deserialize_config from .tagger import Tagger from .. import util from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ @@ -112,6 +113,7 @@ class Morphologizer(Tagger): raise ValueError(Errors.E187) if label in self.labels: return 0 + self._allow_extra_label() # normalize label norm_label = self.vocab.morphology.normalize_features(label) # extract separate POS and morph tags @@ -128,10 +130,11 @@ class Morphologizer(Tagger): return 1 def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -141,9 +144,8 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Morphologizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) + # First, fetch all labels from the data for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ @@ -157,8 +159,25 @@ class Morphologizer(Tagger): if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_pos"][norm_label] = POS_IDS[pos] - self.set_output(len(self.labels)) - self.model.initialize() + if len(self.labels) <= 1: + raise ValueError(Errors.E143.format(name=self.name)) + doc_sample = [] + label_sample = [] + for example in islice(get_examples(), 10): + gold_array = [] + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = token.morph_ + morph_dict = Morphology.feats_to_dict(morph) + if pos: + morph_dict[self.POS_FEAT] = pos + norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels]) + doc_sample.append(example.x) + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 3ef85c821..2f8940124 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger -from ..gold import validate_examples +from ..training import validate_examples from ..language import Language from ._parser_internals import nonproj from ..attrs import POS, ID @@ -90,7 +90,7 @@ class MultitaskObjective(Tagger): label = self.make_label(token) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) - self.model.initialize() + self.model.initialize() # TODO: fix initialization by defining X and Y if sgd is None: sgd = self.create_optimizer() return sgd @@ -178,7 +178,7 @@ class ClozeMultitask(Pipe): pass def begin_training(self, get_examples, pipeline=None, sgd=None): - self.model.initialize() + self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) if sgd is None: diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index d9f33ccb4..2fa5c6392 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 2518ebad3..324c8e19c 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model from ..tokens.doc cimport Doc -from ..gold import validate_examples +from ..training import validate_examples from ..errors import Errors from .. 
import util @@ -160,6 +160,20 @@ cdef class Pipe: """ raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) + + def _require_labels(self) -> None: + """Raise an error if the component's model has no labels defined.""" + if not self.labels or list(self.labels) == [""]: + raise ValueError(Errors.E143.format(name=self.name)) + + + def _allow_extra_label(self) -> None: + """Raise an error if the component can not add any more labels.""" + if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): + if not self.is_resizable(): + raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) + + def create_optimizer(self): """Create an optimizer for the pipeline component. @@ -171,9 +185,12 @@ cdef class Pipe: def begin_training(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using data examples if available. + This method needs to be implemented by each Pipe component, + ensuring the internal model (if available) is initialized properly + using the provided sample of Example objects. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -183,16 +200,24 @@ cdef class Pipe: DOCS: https://nightly.spacy.io/api/pipe#begin_training """ - self.model.initialize() - if sgd is None: - sgd = self.create_optimizer() - return sgd + raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) + + def _ensure_examples(self, get_examples): + if get_examples is None or not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name=self.name, obj=type(get_examples)) + raise ValueError(err) + if not get_examples(): + err = Errors.E930.format(name=self.name, obj=get_examples()) + raise ValueError(err) + + def is_resizable(self): + return hasattr(self, "model") and "resize_output" in self.model.attrs def set_output(self, nO): - if self.model.has_dim("nO") is not False: - self.model.set_dim("nO", nO) - if self.model.has_ref("output_layer"): - self.model.get_ref("output_layer").set_dim("nO", nO) + if self.is_resizable(): + self.model.attrs["resize_output"](self.model, nO) + else: + raise NotImplementedError(Errors.E921) def use_params(self, params): """Modify the pipe's model, to use the given parameter values. At the diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index aaf08d594..5700c2b98 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from ..language import Language from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples from .. 
import util diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index b78be44f8..00664131b 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,4 +1,6 @@ # cython: infer_types=True, profile=True, binding=True +from itertools import islice + import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -9,7 +11,7 @@ from .tagger import Tagger from ..language import Language from ..errors import Errors from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples from .. import util @@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger): return float(loss), d_scores def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger): DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training """ - self.set_output(len(self.labels)) - self.model.initialize() + self._ensure_examples(get_examples) + doc_sample = [] + label_sample = [] + assert self.labels, Errors.E924.format(name=self.name) + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("SENT_START") + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py deleted file mode 100644 index c55edb067..000000000 --- a/spacy/pipeline/simple_ner.py +++ /dev/null @@ -1,211 +0,0 @@ -from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set -from thinc.types import Floats2d -from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model -from thinc.api import Optimizer, Config -from thinc.util import to_numpy - -from ..errors import Errors -from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob -from ..gold import validate_examples -from ..tokens import Doc -from ..language import Language -from ..vocab import Vocab -from ..scorer import Scorer -from .pipe import Pipe - - -default_model_config = """ -[model] -@architectures = "spacy.BILUOTagger.v1" - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 128 -depth = 4 -embed_size = 7000 -window_size = 1 -maxout_pieces = 3 -subword_features = true -""" -DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"] - - -@Language.factory( - "simple_ner", - assigns=["doc.ents"], - default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL}, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, -) -def make_simple_ner( - nlp: Language, 
name: str, model: Model, labels: Iterable[str] -) -> "SimpleNER": - return SimpleNER(nlp.vocab, model, name, labels=labels) - - -class SimpleNER(Pipe): - """Named entity recognition with a tagging model. The model should include - validity constraints to ensure that only valid tag sequences are returned.""" - - def __init__( - self, - vocab: Vocab, - model: Model, - name: str = "simple_ner", - *, - labels: Iterable[str], - ) -> None: - self.vocab = vocab - self.model = model - self.name = name - self.cfg = {"labels": []} - for label in labels: - self.add_label(label) - self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), normalize=True, missing_value=None - ) - assert self.model is not None - - @property - def is_biluo(self) -> bool: - return self.model.name.startswith("biluo") - - @property - def labels(self) -> Tuple[str]: - return tuple(self.cfg["labels"]) - - def add_label(self, label: str) -> None: - """Add a new label to the pipe. - label (str): The label to add. - DOCS: https://nightly.spacy.io/api/simplener#add_label - """ - if not isinstance(label, str): - raise ValueError(Errors.E187) - if label not in self.labels: - self.cfg["labels"].append(label) - self.vocab.strings.add(label) - - def get_tag_names(self) -> List[str]: - if self.is_biluo: - return ( - [f"B-{label}" for label in self.labels] - + [f"I-{label}" for label in self.labels] - + [f"L-{label}" for label in self.labels] - + [f"U-{label}" for label in self.labels] - + ["O"] - ) - else: - return ( - [f"B-{label}" for label in self.labels] - + [f"I-{label}" for label in self.labels] - + ["O"] - ) - - def predict(self, docs: List[Doc]) -> List[Floats2d]: - scores = self.model.predict(docs) - return scores - - def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None: - """Set entities on a batch of documents from a batch of scores.""" - tag_names = self.get_tag_names() - for i, doc in enumerate(docs): - actions = to_numpy(scores[i].argmax(axis=1)) - tags = [tag_names[actions[j]] for j in range(len(doc))] - if not self.is_biluo: - tags = iob_to_biluo(tags) - doc.ents = spans_from_biluo_tags(doc, tags) - - def update( - self, - examples: List[Example], - *, - set_annotations: bool = False, - drop: float = 0.0, - sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None, - ) -> Dict[str, float]: - if losses is None: - losses = {} - losses.setdefault("ner", 0.0) - validate_examples(examples, "SimpleNER.update") - if not any(_has_ner(eg) for eg in examples): - return losses - docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update(docs) - loss, d_scores = self.get_loss(examples, scores) - bp_scores(d_scores) - if set_annotations: - self.set_annotations(docs, scores) - if sgd is not None: - self.model.finish_update(sgd) - losses["ner"] += loss - return losses - - def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]: - validate_examples(examples, "SimpleNER.get_loss") - truths = [] - for eg in examples: - tags = eg.get_aligned_ner() - gold_tags = [(tag if tag != "-" else None) for tag in tags] - if not self.is_biluo: - gold_tags = biluo_to_iob(gold_tags) - truths.append(gold_tags) - for i in range(len(scores)): - if len(scores[i]) != len(truths[i]): - raise ValueError( - f"Mismatched output and gold sizes.\n" - f"Output: {len(scores[i])}, gold: {len(truths[i])}." 
- f"Input: {len(examples[i].doc)}" ) - d_scores, loss = self.loss_func(scores, truths) - return loss, d_scores - - def begin_training( - self, - get_examples: Callable[[], Iterable[Example]], - pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, - sgd: Optional[Optimizer] = None, - ): - all_labels = set() - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="SimpleNER", obj=type(get_examples)) - raise ValueError(err) - for example in get_examples(): - all_labels.update(_get_labels(example)) - for label in sorted(all_labels): - self.add_label(label) - labels = self.labels - n_actions = self.model.attrs["get_num_actions"](len(labels)) - self.model.set_dim("nO", n_actions) - self.model.initialize() - if pipeline is not None: - self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - self.loss_func = SequenceCategoricalCrossentropy( - names=self.get_tag_names(), normalize=True, missing_value=None - ) - return sgd - - def init_multitask_objectives(self, *args, **kwargs): - pass - - def score(self, examples, **kwargs): - validate_examples(examples, "SimpleNER.score") - return Scorer.score_spans(examples, "ents", **kwargs) - - -def _has_ner(example: Example) -> bool: - for ner_tag in example.get_aligned_ner(): - if ner_tag != "-" and ner_tag is not None: - return True - else: - return False - - -def _get_labels(example: Example) -> Set[str]: - labels = set() - for ner_tag in example.get_aligned("ENT_TYPE", as_string=True): - if ner_tag != "O" and ner_tag != "-": - labels.add(ner_tag) - return labels diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 2b760c878..1f8b4eb7a 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -5,6 +5,7 @@ import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.types import Floats2d import warnings +from itertools import islice from ..tokens.doc cimport Doc from ..morphology cimport Morphology @@ -16,7 +17,7 @@ from ..attrs import POS, ID from ..parts_of_speech import X from ..errors import Errors, TempErrors, Warnings from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples from .. import util @@ -258,10 +259,11 @@ class Tagger(Pipe): return float(loss), d_scores def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline.
@@ -271,32 +273,24 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Tagger", obj=type(get_examples)) - raise ValueError(err) - tags = set() + self._ensure_examples(get_examples) doc_sample = [] + label_sample = [] + tags = set() for example in get_examples(): for token in example.y: - tags.add(token.tag_) - if len(doc_sample) < 10: - doc_sample.append(example.x) - if not doc_sample: - doc_sample.append(Doc(self.vocab, words=["hello"])) + if token.tag_: + tags.add(token.tag_) for tag in sorted(tags): self.add_label(tag) - if len(self.labels) == 0: - err = Errors.E1006.format(name="Tagger") - raise ValueError(err) - self.set_output(len(self.labels)) - if doc_sample: - label_sample = [ - self.model.ops.alloc2f(len(doc), len(self.labels)) - for doc in doc_sample - ] - self.model.initialize(X=doc_sample, Y=label_sample) - else: - self.model.initialize() + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("TAG", as_string=True) + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd @@ -313,6 +307,7 @@ class Tagger(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 + self._allow_extra_label() self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index d6efb4348..4be6f580d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,4 @@ +from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d @@ -5,7 +6,7 @@ import numpy from .pipe import Pipe from ..language import Language -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..errors import Errors from ..scorer import Scorer from .. import util @@ -128,11 +129,6 @@ class TextCategorizer(Pipe): """ return tuple(self.cfg.setdefault("labels", [])) - def require_labels(self) -> None: - """Raise an error if the component's model has no labels defined.""" - if not self.labels: - raise ValueError(Errors.E143.format(name=self.name)) - @labels.setter def labels(self, value: Iterable[str]) -> None: self.cfg["labels"] = tuple(value) @@ -311,17 +307,7 @@ class TextCategorizer(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model.has_dim("nO"): - # This functionality was available previously, but was broken. - # The problem is that we resize the last layer, but the last layer - # is actually just an ensemble. We're not resizing the child layers - # - a huge problem. 
- raise ValueError(Errors.E116) - # smaller = self.model._layers[-1] - # larger = Linear(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger + self._allow_extra_label() self.labels = tuple(list(self.labels) + [label]) return 1 @@ -332,10 +318,11 @@ class TextCategorizer(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ) -> Optimizer: - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -345,22 +332,19 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) subbatch = [] # Select a subbatch of examples to initialize the model - for example in get_examples(): + for example in islice(get_examples(), 10): if len(subbatch) < 2: subbatch.append(example) for cat in example.y.cats: self.add_label(cat) - self.require_labels() - docs = [eg.reference for eg in subbatch] - if not docs: # need at least one doc - docs = [Doc(self.vocab, words=["hello"])] - truths, _ = self._examples_to_truth(subbatch) - self.set_output(len(self.labels)) - self.model.initialize(X=docs, Y=truths) + doc_sample = [eg.reference for eg in subbatch] + label_sample, _ = self._examples_to_truth(subbatch) + self._require_labels() + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5657d687d..721c67a19 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,8 +1,9 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from itertools import islice from .pipe import Pipe -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..tokens import Doc from ..vocab import Vocab from ..language import Language @@ -209,10 +210,11 @@ class Tok2Vec(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -222,8 +224,12 @@ class Tok2Vec(Pipe): DOCS: https://nightly.spacy.io/api/tok2vec#begin_training """ - docs = [Doc(self.vocab, words=["hello"])] - self.model.initialize(X=docs) + self._ensure_examples(get_examples) + doc_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + assert doc_sample, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample) def add_label(self, label): raise NotImplementedError diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5a6b491e0..1350e1f12 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..tokens.doc cimport Doc -from ..gold import validate_examples +from ..training import validate_examples from ..errors import Errors, Warnings from .. import util @@ -244,7 +244,7 @@ cdef class Parser(Pipe): int nr_class, int batch_size) nogil: # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc with gil: - assert self.moves.n_moves > 0 + assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = calloc(self.moves.n_moves, sizeof(int)) cdef int i, guess cdef Transition action @@ -378,7 +378,7 @@ cdef class Parser(Pipe): cdef int i # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - assert self.moves.n_moves > 0 + assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) costs = mem.alloc(self.moves.n_moves, sizeof(float)) @@ -406,9 +406,7 @@ cdef class Parser(Pipe): self.model.attrs["resize_output"](self.model, nO) def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: @@ -430,9 +428,6 @@ cdef class Parser(Pipe): if sgd is None: sgd = self.create_optimizer() doc_sample = [] - for example in islice(get_examples(), 10): - doc_sample.append(example.predicted) - if pipeline is not None: for name, component in pipeline: if component is self: @@ -441,10 +436,11 @@ cdef class Parser(Pipe): doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] - if doc_sample: - self.model.initialize(doc_sample) - else: - self.model.initialize() + if not doc_sample: + for example in islice(get_examples(), 10): + doc_sample.append(example.predicted) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(doc_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) return sgd diff --git a/spacy/schemas.py b/spacy/schemas.py index 59af53301..38f47c668 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -12,7 +12,7 @@ from .attrs import NAMES if TYPE_CHECKING: # This lets us add type hints for mypy etc. 
without causing circular imports from .language import Language # noqa: F401 - from .gold import Example # noqa: F401 + from .training import Example # noqa: F401 ItemT = TypeVar("ItemT") @@ -180,7 +180,7 @@ class ModelMetaSchema(BaseModel): url: StrictStr = Field("", title="Model author URL") sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") - labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") + labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") diff --git a/spacy/scorer.py b/spacy/scorer.py index 9b1831a91..7f7418237 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,7 +1,7 @@ from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING import numpy as np -from .gold import Example +from .training import Example from .tokens import Token, Doc, Span from .errors import Errors from .util import get_lang_class, SimpleFrozenList diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index d6e345336..751bd36d4 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,5 +1,6 @@ +from spacy.training import Example from spacy.pipeline import EntityRecognizer -from spacy.tokens import Span +from spacy.tokens import Span, Doc from spacy import registry import pytest @@ -7,6 +8,12 @@ from ..util import get_doc from spacy.pipeline.ner import DEFAULT_NER_MODEL +def _ner_example(ner): + doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} + return Example.from_dict(doc, gold) + + def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) @@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: []) + ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - assert len(list(doc.ents)) == 0 - assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] @@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): + """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) config = { @@ -41,11 +47,11 @@ def test_ents_reset(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: []) + ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) + orig_iobs = [t.ent_iob_ for t in doc] doc.ents = list(doc.ents) - assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) + assert [t.ent_iob_ for t in doc] == orig_iobs def test_add_overlapping_entities(en_vocab): diff --git a/spacy/tests/parser/test_add_label.py 
b/spacy/tests/parser/test_add_label.py index fce5f679f..0da42daa2 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed from spacy import registry from spacy.attrs import NORM from spacy.vocab import Vocab -from spacy.gold import Example +from spacy.training import Example from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline.ner import DEFAULT_NER_MODEL @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.begin_training(lambda: [], **parser.cfg) + parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(5): @@ -47,16 +47,25 @@ def _train_parser(parser): return parser +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +def _ner_example(ner): + doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} + return Example.from_dict(doc, gold) + + def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") sgd = Adam(0.001) for i in range(100): losses = {} - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - example = Example.from_dict(doc, gold) - parser.update([example], sgd=sgd, losses=losses) + parser.update([_parser_example(parser)], sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].dep_ == "right" @@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly(): ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") - ner1.begin_training(lambda: []) + ner1.begin_training(lambda: [_ner_example(ner1)]) ner2 = EntityRecognizer(Vocab(), model, **config) # the second model needs to be resized before we can call from_bytes diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index fd1880030..826fc1d87 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,7 +1,7 @@ import pytest from spacy.vocab import Vocab from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.pipeline import DependencyParser from spacy.tokens import Doc from spacy.pipeline._parser_internals.nonproj import projectivize diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index c7a1ed0d2..548cd2697 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,7 +4,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown -from spacy.gold import Example +from spacy.training import Example from spacy.tokens import Doc from spacy.vocab import Vocab import logging diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 6594c7e78..0747241d8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,7 +1,7 @@ import pytest from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.vocab import Vocab 
from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline.transition_parser import Parser diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8265a8a45..8d45e2132 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -3,7 +3,7 @@ import pytest from spacy.lang.en import English from ..util import get_doc, apply_transition_sequence, make_tempdir from ... import util -from ...gold import Example +from ...training import Example TRAIN_DATA = [ ( @@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer): pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] tokens = en_tokenizer(text) doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, + tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos ) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 594498b0b..1de05be1b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -3,7 +3,7 @@ from thinc.api import Adam from spacy.attrs import NORM from spacy.vocab import Vocab from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.tokens import Doc from spacy.pipeline import DependencyParser @@ -14,6 +14,12 @@ def vocab(): return Vocab(lex_attr_getters={NORM: lambda s: s}) +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + @pytest.fixture def parser(vocab): config = { @@ -28,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.begin_training(lambda: [], **parser.cfg) + parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(10): diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index c12a2b650..9254688cc 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -1,6 +1,6 @@ import pytest import numpy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.pipeline import AttributeRuler from spacy import util, registry diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4eaa71272..c43d2c58e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -4,7 +4,7 @@ import pytest from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy import util, registry -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.tests.util import make_tempdir from spacy.tokens import Span @@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp): def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" + vector_length = 1 @registry.misc.register("myLocationsKB.v1") def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): - mykb = KnowledgeBase(vocab, entity_vector_length=1) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) # adding entities mykb.add_entity(entity="Q1", freq=19, 
entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) @@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} - el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True) - el_pipe.begin_training(lambda: []) - el_pipe.incl_context = False - el_pipe.incl_prior = True + entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True) + nlp.begin_training() + assert entity_linker.model.get_dim("nO") == vector_length # test whether the entity links are preserved by the `as_doc()` function text = "She lives in Boston. He lives in Denver." @@ -373,6 +373,7 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() nlp.add_pipe("sentencizer") + vector_length = 3 # Add a custom component to recognize "Russ Cochran" as an entity for the example training data patterns = [ @@ -393,7 +394,7 @@ def test_overfitting_IO(): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher - mykb = KnowledgeBase(vocab, entity_vector_length=3) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( @@ -406,14 +407,17 @@ def test_overfitting_IO(): return create_kb # Create the Entity Linker component and add it to the pipeline - nlp.add_pipe( + entity_linker = nlp.add_pipe( "entity_linker", config={"kb_loader": {"@misc": "myOverfittingKB.v1"}}, last=True, ) # train the NEL pipe - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert entity_linker.model.get_dim("nO") == vector_length + assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length + for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 501c00f84..864c7332e 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,7 +1,7 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -25,27 +25,61 @@ TRAIN_DATA = [ }, ), # test combinations of morph+POS - ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},), + ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}), ] +def test_no_label(): + nlp = Language() + nlp.add_pipe("morphologizer") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_implicit_label(): + nlp = Language() + nlp.add_pipe("morphologizer") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_no_resize(): + nlp = Language() + morphologizer = nlp.add_pipe("morphologizer") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB") + nlp.begin_training() + # this throws an error because the morphologizer can't be resized after 
initialization + with pytest.raises(ValueError): + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ") + + +def test_begin_training_examples(): + nlp = Language() + morphologizer = nlp.add_pipe("morphologizer") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly nlp = English() - morphologizer = nlp.add_pipe("morphologizer") + nlp.add_pipe("morphologizer") train_examples = [] for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): - if morph and pos: - morphologizer.add_label( - morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos - ) - elif pos: - morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos) - elif morph: - morphologizer.add_label(morph) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) for i in range(50): losses = {} @@ -55,18 +89,8 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue ham" doc = nlp(test_text) - gold_morphs = [ - "Feat=N", - "Feat=V", - "", - "", - ] - gold_pos_tags = [ - "NOUN", - "VERB", - "ADJ", - "", - ] + gold_morphs = ["Feat=N", "Feat=V", "", ""] + gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] assert [t.morph_ for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index b64fa8581..1752df5d0 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,7 +1,7 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -30,6 +30,20 @@ TRAIN_DATA = [ ), ] +def test_begin_training_examples(): + nlp = Language() + senter = nlp.add_pipe("senter") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + def test_overfitting_IO(): # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py deleted file mode 100644 index b012a2cd6..000000000 --- a/spacy/tests/pipeline/test_simple_ner.py +++ /dev/null @@ -1,45 +0,0 @@ -from spacy.lang.en import English -from spacy.gold import Example -from spacy import util -from ..util import make_tempdir - - -TRAIN_DATA = [ - ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), - ("I like London and Berlin.", {"entities": [(7, 13, 
"LOC"), (18, 24, "LOC")]}), -] - - -def test_overfitting_IO(): - # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly - nlp = English() - ner = nlp.add_pipe("simple_ner") - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for ent in annotations.get("entities"): - ner.add_label(ent[2]) - optimizer = nlp.begin_training() - - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - assert losses["ner"] < 0.0001 - - # test the trained model - test_text = "I like London." - doc = nlp(test_text) - ents = doc.ents - assert len(ents) == 1 - assert ents[0].text == "London" - assert ents[0].label_ == "LOC" - - # Also test the results are still the same after IO - with make_tempdir() as tmp_dir: - nlp.to_disk(tmp_dir) - nlp2 = util.load_model_from_path(tmp_dir) - doc2 = nlp2(test_text) - ents2 = doc2.ents - assert len(ents2) == 1 - assert ents2[0].text == "London" - assert ents2[0].label_ == "LOC" diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 540301eac..cd5927675 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,6 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language @@ -34,6 +34,56 @@ TRAIN_DATA = [ ] +def test_no_label(): + nlp = Language() + nlp.add_pipe("tagger") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_no_resize(): + nlp = Language() + tagger = nlp.add_pipe("tagger") + tagger.add_label("N") + tagger.add_label("V") + assert tagger.labels == ("N", "V") + nlp.begin_training() + assert tagger.model.get_dim("nO") == 2 + # this throws an error because the tagger can't be resized after initialization + with pytest.raises(ValueError): + tagger.add_label("J") + + +def test_implicit_label(): + nlp = Language() + nlp.add_pipe("tagger") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_begin_training_examples(): + nlp = Language() + tagger = nlp.add_pipe("tagger") + train_examples = [] + for tag in TAGS: + tagger.add_label(tag) + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: train_examples[0]) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=lambda: []) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly nlp = English() @@ -41,9 +91,8 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - for tag in TAGS: - tagger.add_label(tag) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert tagger.model.get_dim("nO") == len(TAGS) for i in range(50): losses = {} diff --git 
a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 12ead90cb..3f9506bb1 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -10,7 +10,7 @@ from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from ..util import make_tempdir -from ...gold import Example +from ...training import Example TRAIN_DATA = [ @@ -80,6 +80,51 @@ def test_label_types(): textcat.add_label(9) +def test_no_label(): + nlp = Language() + nlp.add_pipe("textcat") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_implicit_label(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_no_resize(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + nlp.begin_training() + assert textcat.model.get_dim("nO") == 2 + # this throws an error because the textcat can't be resized after initialization + with pytest.raises(ValueError): + textcat.add_label("NEUTRAL") + + +def test_begin_training_examples(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for label, value in annotations.get("cats").items(): + textcat.add_label(label) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly fix_random_seed(0) @@ -89,9 +134,8 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for label, value in annotations.get("cats").items(): - textcat.add_label(label) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert textcat.model.get_dim("nO") == 2 for i in range(50): losses = {} diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 5c93ea3c8..ed5bcc1a5 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,7 +1,7 @@ import pytest import random from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 4988575ea..c1d726db6 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -3,7 +3,7 @@ import gc import numpy import copy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 259ca9b0c..357fbb84e 
100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -3,7 +3,7 @@ import numpy from spacy.tokens import Doc from spacy.matcher import Matcher from spacy.displacy import render -from spacy.gold import iob_to_biluo +from spacy.training import iob_to_biluo from spacy.lang.it import Italian from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 3882df0a6..beb8faca1 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,6 +1,6 @@ import pytest from spacy import displacy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.lang.ja import Japanese from spacy.lang.xx import MultiLanguage @@ -20,7 +20,7 @@ def test_issue2564(): nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - tagger.begin_training(lambda: []) + nlp.begin_training() doc = nlp("hello world") assert doc.is_tagged docs = nlp.pipe(["hello", "world"]) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index fc2a3ed7c..d36e693c7 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token from spacy.matcher import Matcher, PhraseMatcher from spacy.errors import MatchPatternError from spacy.util import minibatch -from spacy.gold import Example +from spacy.training import Example from spacy.lang.hi import Hindi from spacy.lang.es import Spanish from spacy.lang.en import English @@ -251,6 +251,12 @@ def test_issue3803(): assert [t.like_num for t in doc] == [True, True, True, True, True, True] +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { @@ -264,7 +270,7 @@ def test_issue3830_no_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -281,7 +287,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index e846841d4..2beccedcf 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -2,8 +2,8 @@ import pytest from spacy.pipeline import Pipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin -from spacy.gold import Example, Corpus -from spacy.gold.converters import json2docs +from spacy.training import Example, Corpus +from spacy.training.converters import json2docs from spacy.vocab import Vocab from spacy.lang.en import English from spacy.util import minibatch, ensure_path, load_model diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 
d83a2c718..9454d7f0c 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -1,9 +1,7 @@ import pytest -from mock import Mock -from spacy.matcher import DependencyMatcher from spacy.tokens import Doc, Span, DocBin -from spacy.gold import Example -from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.training import Example +from spacy.training.converters.conllu2docs import conllu2docs from spacy.lang.en import English from spacy.kb import KnowledgeBase from spacy.vocab import Vocab @@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path import numpy import pickle -from ..util import get_doc, make_tempdir +from ..util import make_tempdir def test_issue4528(en_vocab): diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index af643aadc..531e48ec3 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - tagger.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return entity_linker diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index aa8ea6051..0df707dc0 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,14 +1,15 @@ import pytest from click import NoSuchOption -from spacy.gold import docs_to_json, biluo_tags_from_offsets -from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs +from spacy.training import docs_to_json, biluo_tags_from_offsets +from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.pretrain import make_docs from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables +from spacy.cli._util import string_to_list from thinc.config import ConfigValidationError import srsly @@ -372,17 +373,13 @@ def test_parse_config_overrides(args, expected): assert parse_config_overrides(args) == expected -@pytest.mark.parametrize( - "args", [["--foo"], ["--x.foo", "bar", "--baz"]], -) +@pytest.mark.parametrize("args", [["--foo"], ["--x.foo", "bar", "--baz"]]) def test_parse_config_overrides_invalid(args): with pytest.raises(NoSuchOption): parse_config_overrides(args) -@pytest.mark.parametrize( - "args", [["--x.foo", "bar", "baz"], ["x.foo"]], -) +@pytest.mark.parametrize("args", [["--x.foo", "bar", "baz"], ["x.foo"]]) def test_parse_config_overrides_invalid_2(args): with pytest.raises(SystemExit): parse_config_overrides(args) @@ -401,3 +398,44 @@ def test_init_config(lang, pipeline, optimize): def test_model_recommendations(): for lang, data in RECOMMENDATIONS.items(): assert RecommendationSchema(**data) + + +@pytest.mark.parametrize( + "value", + [ + # fmt: off + "parser,textcat,tagger", + " parser, textcat ,tagger ", + 'parser,textcat,tagger', + ' parser, textcat ,tagger ', + ' "parser"," textcat " 
,"tagger "', + " 'parser',' textcat ' ,'tagger '", + '[parser,textcat,tagger]', + '["parser","textcat","tagger"]', + '[" parser" ,"textcat ", " tagger " ]', + "[parser,textcat,tagger]", + "[ parser, textcat , tagger]", + "['parser','textcat','tagger']", + "[' parser' , 'textcat', ' tagger ' ]", + # fmt: on + ], +) +def test_string_to_list(value): + assert string_to_list(value, intify=False) == ["parser", "textcat", "tagger"] + + +@pytest.mark.parametrize( + "value", + [ + # fmt: off + "1,2,3", + '[1,2,3]', + '["1","2","3"]', + '[" 1" ,"2 ", " 3 " ]', + "[' 1' , '2', ' 3 ' ]", + # fmt: on + ], +) +def test_string_to_list_intify(value): + assert string_to_list(value, intify=False) == ["1", "2", "3"] + assert string_to_list(value, intify=True) == [1, 2, 3] diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index ebc804235..840d878c2 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -3,7 +3,7 @@ import pytest from spacy.language import Language from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.util import registry diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 321eaae95..597809286 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -1,5 +1,5 @@ import pytest -from spacy.gold.example import Example +from spacy.training.example import Example from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 6dae14210..fb96c0361 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,8 +1,8 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example -from spacy.gold.iob_utils import biluo_tags_from_offsets +from spacy.training import Example +from spacy.training.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9f0f4b74a..fb30c6ae5 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.gold import Example +from spacy.training import Example from spacy import util from spacy.lang.en import English from .util import get_batch @@ -89,6 +89,7 @@ def test_init_tok2vec(): tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] nlp.begin_training() + assert tok2vec.model.get_dim("nO") cfg_string = """ diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_training.py similarity index 98% rename from spacy/tests/test_gold.py rename to spacy/tests/test_training.py index 334d9fc24..1926aca1f 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_training.py @@ -1,9 +1,10 @@ import numpy -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment -from spacy.gold import spans_from_biluo_tags, iob_to_biluo -from spacy.gold import Corpus, docs_to_json -from spacy.gold.example import Example -from spacy.gold.converters import json2docs +from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment 
+from spacy.training import spans_from_biluo_tags, iob_to_biluo +from spacy.training import Corpus, docs_to_json +from spacy.training.example import Example +from spacy.training.converters import json2docs +from spacy.training.augment import make_orth_variants_example from spacy.lang.en import English from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch @@ -12,7 +13,6 @@ import pytest import srsly from .util import make_tempdir -from ..gold.augment import make_orth_variants_example @pytest.fixture diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 40cd71eb5..1f073ab32 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -5,7 +5,7 @@ from .util import get_random_doc from spacy import util from spacy.util import dot_to_object, SimpleFrozenList from thinc.api import Config, Optimizer -from spacy.gold.batchers import minibatch_by_words +from spacy.training.batchers import minibatch_by_words from ..lang.en import English from ..lang.nl import Dutch from ..language import DEFAULT_CONFIG_PATH diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5e7222d40..787cca652 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -24,7 +24,7 @@ from .util import registry from .attrs import intify_attrs from .symbols import ORTH from .scorer import Scorer -from .gold import validate_examples +from .training import validate_examples cdef class Tokenizer: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3f8c735fb..93520aeda 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -576,7 +576,7 @@ cdef class Doc: entity_type = 0 kb_id = 0 - # Set ent_iob to Missing (0) bij default unless this token was nered before + # Set ent_iob to Missing (0) by default unless this token was nered before ent_iob = 0 if self.c[i].ent_iob != 0: ent_iob = 2 diff --git a/spacy/gold/__init__.pxd b/spacy/training/__init__.pxd similarity index 100% rename from spacy/gold/__init__.pxd rename to spacy/training/__init__.pxd diff --git a/spacy/gold/__init__.py b/spacy/training/__init__.py similarity index 100% rename from spacy/gold/__init__.py rename to spacy/training/__init__.py diff --git a/spacy/gold/align.py b/spacy/training/align.py similarity index 100% rename from spacy/gold/align.py rename to spacy/training/align.py diff --git a/spacy/gold/augment.py b/spacy/training/augment.py similarity index 100% rename from spacy/gold/augment.py rename to spacy/training/augment.py diff --git a/spacy/gold/batchers.py b/spacy/training/batchers.py similarity index 100% rename from spacy/gold/batchers.py rename to spacy/training/batchers.py diff --git a/spacy/gold/converters/__init__.py b/spacy/training/converters/__init__.py similarity index 100% rename from spacy/gold/converters/__init__.py rename to spacy/training/converters/__init__.py diff --git a/spacy/gold/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner2docs.py similarity index 99% rename from spacy/gold/converters/conll_ner2docs.py rename to spacy/training/converters/conll_ner2docs.py index c04a77f07..8dcaf2599 100644 --- a/spacy/gold/converters/conll_ner2docs.py +++ b/spacy/training/converters/conll_ner2docs.py @@ -1,7 +1,7 @@ from wasabi import Printer from .. 
import tags_to_entities -from ...gold import iob_to_biluo +from ...training import iob_to_biluo from ...lang.xx import MultiLanguage from ...tokens import Doc, Span from ...util import load_model diff --git a/spacy/gold/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py similarity index 99% rename from spacy/gold/converters/conllu2docs.py rename to spacy/training/converters/conllu2docs.py index 11ee86182..85afdeef3 100644 --- a/spacy/gold/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -1,7 +1,7 @@ import re from .conll_ner2docs import n_sents_info -from ...gold import iob_to_biluo, spans_from_biluo_tags +from ...training import iob_to_biluo, spans_from_biluo_tags from ...tokens import Doc, Token, Span from ...vocab import Vocab from wasabi import Printer diff --git a/spacy/gold/converters/iob2docs.py b/spacy/training/converters/iob2docs.py similarity index 97% rename from spacy/gold/converters/iob2docs.py rename to spacy/training/converters/iob2docs.py index eebf1266b..f8076c5ab 100644 --- a/spacy/gold/converters/iob2docs.py +++ b/spacy/training/converters/iob2docs.py @@ -1,7 +1,7 @@ from wasabi import Printer from .conll_ner2docs import n_sents_info -from ...gold import iob_to_biluo, tags_to_entities +from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span from ...util import minibatch diff --git a/spacy/gold/converters/json2docs.py b/spacy/training/converters/json2docs.py similarity index 100% rename from spacy/gold/converters/json2docs.py rename to spacy/training/converters/json2docs.py diff --git a/spacy/gold/corpus.py b/spacy/training/corpus.py similarity index 100% rename from spacy/gold/corpus.py rename to spacy/training/corpus.py diff --git a/spacy/gold/example.pxd b/spacy/training/example.pxd similarity index 100% rename from spacy/gold/example.pxd rename to spacy/training/example.pxd diff --git a/spacy/gold/example.pyx b/spacy/training/example.pyx similarity index 100% rename from spacy/gold/example.pyx rename to spacy/training/example.pyx diff --git a/spacy/gold/gold_io.pyx b/spacy/training/gold_io.pyx similarity index 100% rename from spacy/gold/gold_io.pyx rename to spacy/training/gold_io.pyx diff --git a/spacy/gold/iob_utils.py b/spacy/training/iob_utils.py similarity index 97% rename from spacy/gold/iob_utils.py rename to spacy/training/iob_utils.py index 08751cfd4..ceb5e16b8 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -195,13 +195,15 @@ def tags_to_entities(tags): continue elif tag.startswith("I"): if start is None: - raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) continue if tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i elif tag.startswith("L"): + if start is None: + raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1])) entities.append((tag[2:], start, i)) start = None else: diff --git a/spacy/gold/loggers.py b/spacy/training/loggers.py similarity index 100% rename from spacy/gold/loggers.py rename to spacy/training/loggers.py diff --git a/spacy/util.py b/spacy/util.py index fa4815df8..d8df04554 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -93,6 +93,7 @@ class registry(thinc.registry): # environment. spaCy models packaged with `spacy package` will "advertise" # themselves via entry points. 
models = catalogue.create("spacy", "models", entry_points=True) + cli = catalogue.create("spacy", "cli", entry_points=True) class SimpleFrozenDict(dict): @@ -647,7 +648,7 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: +def run_command(command: Union[str, List[str]], *, capture=False, stdin=None): """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. diff --git a/website/README.md b/website/README.md index f3a64d1cb..825d13c65 100644 --- a/website/README.md +++ b/website/README.md @@ -289,11 +289,11 @@ always be the **last element** in the row. > | Column 1 | Column 2 ~~List[Doc]~~ | > ``` -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| Name | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | ### List {#list} @@ -609,7 +609,6 @@ In addition to the native markdown elements, you can use the components ├── docs # the actual markdown content ├── meta # JSON-formatted site metadata | ├── languages.json # supported languages and statistical models -| ├── logos.json # logos and links for landing page | ├── sidebars.json # sidebar navigations for different sections | ├── site.json # general site metadata | └── universe.json # data for the spaCy universe section diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index ee844d961..30d863b17 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -181,10 +181,10 @@ characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures that the final character is always in the last position, instead of being in an arbitrary position depending on the word length. -The characters are embedded in a embedding table with 256 rows, and the vectors -concatenated. A hash-embedded vector of the `NORM` of the word is also -concatenated on, and the result is then passed through a feed-forward network to -construct a single vector to represent the information. +The characters are embedded in a embedding table with a given number of rows, +and the vectors concatenated. A hash-embedded vector of the `NORM` of the word +is also concatenated on, and the result is then passed through a feed-forward +network to construct a single vector to represent the information. 
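To make the fixed character window described above concrete, here is a rough standalone sketch (not spaCy's actual implementation; padding short words with a blank symbol is an assumption for illustration):

```python
def char_window(word: str, n: int = 4, pad: str = " ") -> str:
    # Take n characters from the start and n from the end, so the final
    # character always lands in the last position -- e.g. "jumpping" for
    # "jumping" with n=4.
    prefix = (word[:n] + pad * n)[:n]
    suffix = (pad * n + word[-n:])[-n:]
    return prefix + suffix

assert char_window("jumping") == "jumpping"
```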
| Name | Description | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -456,62 +456,6 @@ consists of either two or three subnetworks: | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | -### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"} - -> #### Example Config -> -> ```ini -> [model] -> @architectures = "spacy.BILUOTagger.v1 " -> -> [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> # etc. -> ``` - -Construct a simple NER tagger that predicts -[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and -uses greedy decoding with transition-constraints to return a valid BILUO tag -sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled -spans into tags assigned to each token. The first token of a span is given the -tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens -within the span are given the tag `U-LABEL`. Single-token spans are given the -tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme -generally results in better linear separation between classes, especially for -non-CRF models, because there are more distinct classes for the different -situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)). - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | - -### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"} - -> #### Example Config -> -> ```ini -> [model] -> @architectures = "spacy.IOBTagger.v1 " -> -> [model.tok2vec] -> @architectures = "spacy.HashEmbedCNN.v1" -> # etc. -> ``` - -Construct a simple NER tagger, that predicts -[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and -uses greedy decoding with transition-constraints to return a valid IOB tag -sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled -spans into tags assigned to each token. The first token of a span is given the -tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens -are assigned the tag O. - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | - ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ### spacy.Tagger.v1 {#Tagger} diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index fc72eda98..53c8c46cf 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -38,7 +38,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. 
~~bool~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py +%%GITHUB_SPACY/spacy/pipeline/attributeruler.py ``` ## AttributeRuler.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7852d0482..55e552e72 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -229,13 +229,13 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] ### Converters {#converters} -| ID | Description | -| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x. | -| `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| ID | Description | +| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x. | +| `conll` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). 
| ## debug {#debug new="3"} @@ -357,38 +357,38 @@ File /path/to/spacy/ml/models/tok2vec.py (line 207) ℹ [training.logger] Registry @loggers Name spacy.ConsoleLogger.v1 -Module spacy.gold.loggers -File /path/to/spacy/gold/loggers.py (line 8) +Module spacy.training.loggers +File /path/to/spacy/training/loggers.py (line 8) ℹ [training.batcher] Registry @batchers Name spacy.batch_by_words.v1 -Module spacy.gold.batchers -File /path/to/spacy/gold/batchers.py (line 49) +Module spacy.training.batchers +File /path/to/spacy/training/batchers.py (line 49) ℹ [training.batcher.size] Registry @schedules Name compounding.v1 Module thinc.schedules -File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43) +File /path/to/thinc/thinc/schedules.py (line 43) ℹ [training.dev_corpus] Registry @readers Name spacy.Corpus.v1 -Module spacy.gold.corpus -File /path/to/spacy/gold/corpus.py (line 18) +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers Name Adam.v1 Module thinc.optimizers -File /Users/ines/Repos/explosion/thinc/thinc/optimizers.py (line 58) +File /path/to/thinc/thinc/optimizers.py (line 58) ℹ [training.optimizer.learn_rate] Registry @schedules Name warmup_linear.v1 Module thinc.schedules -File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91) +File /path/to/thinc/thinc/schedules.py (line 91) ℹ [training.train_corpus] Registry @readers Name spacy.Corpus.v1 -Module spacy.gold.corpus -File /path/to/spacy/gold/corpus.py (line 18) +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) ``` @@ -852,7 +852,7 @@ this, you can set the `--no-sdist` flag. ```cli -$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--version] [--force] +$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force] ``` > #### Example @@ -870,6 +870,7 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] | `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | | `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | | `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | +| `--name`, `-n` 3 | Package name to override in meta. ~~Optional[str] \(option)~~ | | `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | | `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ | @@ -893,7 +894,7 @@ can provide any other repo (public or private) that you have access to using the ```cli -$ python -m spacy project clone [name] [dest] [--repo] +$ python -m spacy project clone [name] [dest] [--repo] [--branch] ``` > #### Example @@ -908,13 +909,14 @@ $ python -m spacy project clone [name] [dest] [--repo] > $ python -m spacy project clone template --repo https://github.com/your_org/your_repo > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ | -| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ | -| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | The cloned [project directory](/usage/projects#project-files). | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ | +| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ | +| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ | +| `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The cloned [project directory](/usage/projects#project-files). | ### project assets {#project-assets tag="command"} diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 86cfa9121..f6f6bbf68 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -2,7 +2,7 @@ title: Corpus teaser: An annotated corpus tag: class -source: spacy/gold/corpus.py +source: spacy/training/corpus.py new: 3 --- @@ -42,7 +42,7 @@ streaming. | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py +%%GITHUB_SPACY/spacy/training/corpus.py ``` ## Corpus.\_\_init\_\_ {#init tag="method"} @@ -58,7 +58,7 @@ train/test skew. > #### Example > > ```python -> from spacy.gold import Corpus +> from spacy.training import Corpus > > # With a single file > corpus = Corpus("./data/train.spacy") @@ -82,7 +82,7 @@ Yield examples from the data. > #### Example > > ```python -> from spacy.gold import Corpus +> from spacy.training import Corpus > import spacy > > corpus = Corpus("./train.spacy") diff --git a/website/docs/api/cython.md b/website/docs/api/cython.md index d7c03cf41..16b11cead 100644 --- a/website/docs/api/cython.md +++ b/website/docs/api/cython.md @@ -23,12 +23,12 @@ abruptly. With Cython there are four ways of declaring complex data types. 
Unfortunately we use all four in different places, as they all have different utility: -| Declaration | Description | Example | -| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | -| `class` | A normal Python class. | [`Language`](/api/language) | -| `cdef class` | A Python extension type. Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) | -| `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) | -| `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](https://github.com/explosion/spaCy/tree/master/spacy/syntax/_state.pxd) | +| Declaration | Description | Example | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | +| `class` | A normal Python class. | [`Language`](/api/language) | +| `cdef class` | A Python extension type. Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) | +| `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) | +| `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](%%GITHUB_SPACY/spacy/pipeline/_parser_internals/_state.pxd) | The most important classes in spaCy are defined as `cdef class` objects. The underlying data for these objects is usually gathered into a struct, which is diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3fd2818f4..3d78df39d 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -37,7 +37,7 @@ recommended settings for your use case, check out the > guide on [registered functions](/usage/training#config-functions) for details. 
```ini -https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg +%%GITHUB_SPACY/spacy/default_config.cfg ``` @@ -45,8 +45,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg Under the hood, spaCy's configs are powered by our machine learning library [Thinc's config system](https://thinc.ai/docs/usage-config), which uses [`pydantic`](https://github.com/samuelcolvin/pydantic/) for data validation -based on type hints. See -[`spacy/schemas.py`](https://github.com/explosion/spaCy/blob/develop/spacy/schemas.py) +based on type hints. See [`spacy/schemas.py`](%%GITHUB_SPACY/spacy/schemas.py) for the schemas used to validate the default config. Arguments of registered functions are validated against their type annotations, if available. To debug your config and check that it's valid, you can run the @@ -175,7 +174,7 @@ run [`spacy pretrain`](/api/cli#pretrain). > > ```python > from spacy.tokens import DocBin -> from spacy.gold import Corpus +> from spacy.training import Corpus > > doc_bin = DocBin(docs=docs) > doc_bin.to_disk("./data.spacy") @@ -456,7 +455,7 @@ lexical data. Here's an example of the 20 most frequent lexemes in the English training data: ```json -https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl +%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl ``` ## Pipeline meta {#meta} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 7a09a840a..674812567 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -57,7 +57,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx +%%GITHUB_SPACY/spacy/pipeline/dep_parser.pyx ``` ## DependencyParser.\_\_init\_\_ {#init tag="method"} @@ -293,7 +293,12 @@ context, the original parameters are restored. ## DependencyParser.add_label {#add_label tag="method"} -Add a new label to the pipe. +Add a new label to the pipe. Note that you don't have to call this method if you +provide a **representative data sample** to the +[`begin_training`](#begin_training) method. In this case, all labels found in +the sample will be automatically added to the model, and the output dimension +will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > @@ -307,6 +312,25 @@ Add a new label to the pipe. | `label` | The label to add. ~~str~~ | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | +## DependencyParser.set_output {#set_output tag="method"} + +Change the output dimension of the component's model by calling the model's +attribute `resize_output`. This is a function that takes the original model and +the new output dimension `nO`, and changes the model in place. When resizing an +already trained model, care should be taken to avoid the "catastrophic +forgetting" problem. + +> #### Example +> +> ```python +> parser = nlp.add_pipe("parser") +> parser.set_output(512) +> ``` + +| Name | Description | +| ---- | --------------------------------- | +| `nO` | The new output dimension. ~~int~~ | + ## DependencyParser.to_disk {#to_disk tag="method"} Serialize the pipe to disk. 
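To illustrate the label-inference behaviour described for `add_label` above, here is a minimal hedged sketch (the sentence and dependency labels are made up for illustration):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
parser = nlp.add_pipe("parser")

# Either declare labels explicitly before initialization ...
parser.add_label("nsubj")

# ... or provide a representative sample to begin_training, which adds all
# labels found in the data and infers the model's output dimension.
doc = nlp.make_doc("She ate the pizza")
example = Example.from_dict(
    doc, {"heads": [1, 1, 3, 1], "deps": ["nsubj", "ROOT", "det", "dobj"]}
)
optimizer = nlp.begin_training(lambda: [example])
```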
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 8cde6c490..a9d45d68e 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters. | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py +%%GITHUB_SPACY/spacy/pipeline/entity_linker.py ``` ## EntityLinker.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index b6b9caa84..1420aa1a7 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -48,7 +48,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx +%%GITHUB_SPACY/spacy/pipeline/ner.pyx ``` ## EntityRecognizer.\_\_init\_\_ {#init tag="method"} @@ -281,7 +281,12 @@ context, the original parameters are restored. ## EntityRecognizer.add_label {#add_label tag="method"} -Add a new label to the pipe. +Add a new label to the pipe. Note that you don't have to call this method if you +provide a **representative data sample** to the +[`begin_training`](#begin_training) method. In this case, all labels found in +the sample will be automatically added to the model, and the output dimension +will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > @@ -295,6 +300,25 @@ Add a new label to the pipe. | `label` | The label to add. ~~str~~ | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | +## EntityRecognizer.set_output {#set_output tag="method"} + +Change the output dimension of the component's model by calling the model's +attribute `resize_output`. This is a function that takes the original model and +the new output dimension `nO`, and changes the model in place. When resizing an +already trained model, care should be taken to avoid the "catastrophic +forgetting" problem. + +> #### Example +> +> ```python +> ner = nlp.add_pipe("ner") +> ner.set_output(512) +> ``` + +| Name | Description | +| ---- | --------------------------------- | +| `nO` | The new output dimension. ~~int~~ | + ## EntityRecognizer.to_disk {#to_disk tag="method"} Serialize the pipe to disk. diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 454b2a04b..a6934eeef 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -42,7 +42,7 @@ how the component should be configured. You can override its settings via the | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. 
~~str~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py +%%GITHUB_SPACY/spacy/pipeline/entityruler.py ``` ## EntityRuler.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 2434cce43..668c8028f 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -2,7 +2,7 @@ title: Example teaser: A training instance tag: class -source: spacy/gold/example.pyx +source: spacy/training/example.pyx new: 3.0 --- @@ -22,7 +22,7 @@ both documents. > > ```python > from spacy.tokens import Doc -> from spacy.gold import Example +> from spacy.training import Example > > words = ["hello", "world", "!"] > spaces = [True, False, False] @@ -48,7 +48,7 @@ see the [training format documentation](/api/data-formats#dict-input). > > ```python > from spacy.tokens import Doc -> from spacy.gold import Example +> from spacy.training import Example > > predicted = Doc(vocab, words=["Apply", "some", "sunscreen"]) > token_ref = ["Apply", "some", "sun", "screen"] @@ -301,7 +301,7 @@ tokenizations add up to the same string. For example, you'll be able to align > #### Example > > ```python -> from spacy.gold import Alignment +> from spacy.training import Alignment > > bert_tokens = ["obama", "'", "s", "podcast"] > spacy_tokens = ["obama", "'s", "podcast"] diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 7799f103b..c24023177 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -205,8 +205,15 @@ examples can either be the full training data or a representative sample. They are used to **initialize the models** of trainable pipeline components and are passed each component's [`begin_training`](/api/pipe#begin_training) method, if available. Initialization includes validating the network, -[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and -setting up the label scheme based on the data. +[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference) +and setting up the label scheme based on the data. + +If no `get_examples` function is provided when calling `nlp.begin_training`, the +pipeline components will be initialized with generic data. In this case, it is +crucial that the output dimension of each component has already been defined +either in the [config](/usage/training#config), or by calling +[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for +the tagger or textcat). @@ -937,11 +944,11 @@ available to the loaded object. ## Class attributes {#class-attributes} -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | -| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). 
~~Config~~ | +| Name | Description | +| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | +| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | +| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {#defaults} @@ -974,34 +981,17 @@ customize the default language data: > config = Config().from_str(DEFAULT_CONFIG) > ``` -| Name | Description | -| --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`][stop_words.py] ~~Set[str]~~ | -| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] ~~Dict[str, List[dict]]~~ | -| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`puncutation.py`][punctuation.py] ~~Optional[List[Union[str, Pattern]]]~~ | -| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] ~~Optional[Pattern]~~ | -| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] ~~Optional[Pattern]~~ | -| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`][lex_attrs.py] ~~Dict[int, Callable[[str], Any]]~~ | -| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`][syntax_iterators.py]. ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | -| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Dict[str, Any]~~ | -| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Config~~ | - -[stop_words.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py -[tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/tokenizer_exceptions.py -[de/tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py -[fr/tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/tokenizer_exceptions.py -[punctuation.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py -[lex_attrs.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py -[syntax_iterators.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py -[zh/__init__.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/zh/__init__.py +| Name | Description | +| --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) ~~Set[str]~~ | +| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) ~~Dict[str, List[dict]]~~ | +| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[List[Union[str, Pattern]]]~~ |
+| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | +| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | +| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) ~~Dict[int, Callable[[str], Any]]~~ | +| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
+| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to `{"direction": "ltr", "has_case": True, "has_letters": True}`.
**Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Dict[str, Any]~~ | +| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Config~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 45a8736db..486410907 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -56,7 +56,7 @@ data formats used by the lookup and rule-based lemmatizers, see | `model` | **Not yet implemented:** the model to use. ~~Model~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py +%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py ``` ## Lemmatizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 069856ea3..f2b2f9cc0 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -37,7 +37,7 @@ architectures and their arguments and hyperparameters. | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx +%%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx ``` ## Morphologizer.\_\_init\_\_ {#init tag="method"} @@ -258,6 +258,13 @@ context, the original parameters are restored. Add a new label to the pipe. If the `Morphologizer` should set annotations for both `pos` and `morph`, the label should include the UPOS as the feature `POS`. +Raises an error if the output dimension is already set, or if the model has +already been fully [initialized](#begin_training). Note that you don't have to +call this method if you provide a **representative data sample** to the +[`begin_training`](#begin_training) method. In this case, all labels found in +the sample will be automatically added to the model, and the output dimension +will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 57b2af44d..c8d61a5a9 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -22,7 +22,7 @@ for how to use the `Pipe` base class to implement custom components. > inherit from `Pipe`. ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/pipe.pyx +%%GITHUB_SPACY/spacy/pipeline/pipe.pyx ``` ## Pipe.\_\_init\_\_ {#init tag="method"} @@ -286,9 +286,6 @@ context, the original parameters are restored. ## Pipe.add_label {#add_label tag="method"} -Add a new label to the pipe. It's possible to extend trained models with new -labels, but care should be taken to avoid the "catastrophic forgetting" problem. - > #### Example > > ```python @@ -296,10 +293,82 @@ labels, but care should be taken to avoid the "catastrophic forgetting" problem. > pipe.add_label("MY_LABEL") > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------- | -| `label` | The label to add. ~~str~~ | -| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | +Add a new label to the pipe, to be predicted by the model. The actual +implementation depends on the specific component, but in general `add_label` +shouldn't be called if the output dimension is already set, or if the model has +already been fully [initialized](#begin_training). If these conditions are +violated, the function will raise an Error. 
The exception to this rule is when +the component is [resizable](#is_resizable), in which case +[`set_output`](#set_output) should be called to ensure that the model is +properly resized. + + + +This method needs to be overwritten with your own custom `add_label` method. + + + +| Name | Description | +| ----------- | ------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ | + +Note that in general, you don't have to call `pipe.add_label` if you provide a +representative data sample to the [`begin_training`](#begin_training) method. In +this case, all labels found in the sample will be automatically added to the +model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. + +## Pipe.is_resizable {#is_resizable tag="method"} + +> #### Example +> +> ```python +> can_resize = pipe.is_resizable() +> ``` +> +> With custom resizing implemented by a component: +> +> ```python +> def custom_resize(model, new_nO): +> # adjust model +> return model +> +> custom_model.attrs["resize_output"] = custom_resize +> ``` + +Check whether or not the output dimension of the component's model can be +resized. If this method returns `True`, [`set_output`](#set_output) can be +called to change the model's output dimension. + +For built-in components that are not resizable, you have to create and train a +new model from scratch with the appropriate architecture and output dimension. +For custom components, you can implement a `resize_output` function and add it +as an attribute to the component's model. + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------- | +| **RETURNS** | Whether or not the output dimension of the model can be changed after initialization. ~~bool~~ | + +## Pipe.set_output {#set_output tag="method"} + +Change the output dimension of the component's model. If the component is not +[resizable](#is_resizable), this method will raise a `NotImplementedError`. If a +component is resizable, the model's attribute `resize_output` will be called. +This is a function that takes the original model and the new output dimension +`nO`, and changes the model in place. When resizing an already trained model, +care should be taken to avoid the "catastrophic forgetting" problem. + +> #### Example +> +> ```python +> if pipe.is_resizable(): +> pipe.set_output(512) +> ``` + +| Name | Description | +| ---- | --------------------------------- | +| `nO` | The new output dimension. ~~int~~ | ## Pipe.to_disk {#to_disk tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 3d9f61e8d..ca19327bb 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -34,7 +34,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). 
~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx +%%GITHUB_SPACY/spacy/pipeline/senter.pyx ``` ## SentenceRecognizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 8104b1151..c435acdcb 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -33,7 +33,7 @@ how the component should be configured. You can override its settings via the | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx +%%GITHUB_SPACY/spacy/pipeline/sentencizer.pyx ``` ## Sentencizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index af0e3af3c..d83a77357 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -34,7 +34,7 @@ architectures and their arguments and hyperparameters. | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx +%%GITHUB_SPACY/spacy/pipeline/tagger.pyx ``` ## Tagger.\_\_init\_\_ {#init tag="method"} @@ -249,9 +249,9 @@ Score a batch of examples. > scores = tagger.score(examples) > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | | **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ | ## Tagger.create_optimizer {#create_optimizer tag="method"} @@ -288,7 +288,13 @@ context, the original parameters are restored. ## Tagger.add_label {#add_label tag="method"} -Add a new label to the pipe. +Add a new label to the pipe. Raises an error if the output dimension is already +set, or if the model has already been fully [initialized](#begin_training). Note +that you don't have to call this method if you provide a **representative data +sample** to the [`begin_training`](#begin_training) method. In this case, all +labels found in the sample will be automatically added to the model, and the +output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 927ac5411..cc20d6fd2 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -41,7 +41,7 @@ architectures and their arguments and hyperparameters. | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). 
~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py +%%GITHUB_SPACY/spacy/pipeline/textcat.py ``` ## TextCategorizer.\_\_init\_\_ {#init tag="method"} @@ -297,7 +297,13 @@ Modify the pipe's model, to use the given parameter values. ## TextCategorizer.add_label {#add_label tag="method"} -Add a new label to the pipe. +Add a new label to the pipe. Raises an error if the output dimension is already +set, or if the model has already been fully [initialized](#begin_training). Note +that you don't have to call this method if you provide a **representative data +sample** to the [`begin_training`](#begin_training) method. In this case, all +labels found in the sample will be automatically added to the model, and the +output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index deb8369ab..6f13a17a5 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | `model` | The model to use. Defaults to [HashEmbedCNN](/api/architectures#HashEmbedCNN). ~~Model[List[Doc], List[Floats2d]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py +%%GITHUB_SPACY/spacy/pipeline/tok2vec.py ``` ## Tok2Vec.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7f2eb2e66..38e2299fa 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -105,8 +105,7 @@ your installation, installed pipelines and local setup from within spaCy. ### spacy.explain {#spacy.explain tag="function"} Get a description for a given POS tag, dependency label or entity type. For a -list of available terms, see -[`glossary.py`](https://github.com/explosion/spaCy/tree/master/spacy/glossary.py). +list of available terms, see [`glossary.py`](%%GITHUB_SPACY/spacy/glossary.py). > #### Example > @@ -262,11 +261,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. 
Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -348,7 +347,7 @@ See the [`Transformer`](/api/transformer) API reference and | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | -## Loggers {#loggers source="spacy/gold/loggers.py" new="3"} +## Loggers {#loggers source="spacy/training/loggers.py" new="3"} A logger records the training results. When a logger is created, two functions are returned: one for logging the information for each training step, and a @@ -452,7 +451,7 @@ remain in the config file stored on your local system. | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | -## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} +## Batchers {#batchers source="spacy/training/batchers.py" new="3"} A data batcher implements a batching strategy that essentially turns a stream of items into a stream of batches, with each batch consisting of one item or a list @@ -536,9 +535,9 @@ sequences in the batch. | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | -## Training data and alignment {#gold source="spacy/gold"} +## Training data and alignment {#gold source="spacy/training"} -### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} +### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, @@ -554,7 +553,7 @@ single-token entity. > #### Example > > ```python -> from spacy.gold import biluo_tags_from_offsets +> from spacy.training import biluo_tags_from_offsets > > doc = nlp("I like London.") > entities = [(7, 13, "LOC")] @@ -568,7 +567,7 @@ single-token entity. | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. 
~~List[str]~~ | -### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} +### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. @@ -576,7 +575,7 @@ Encode per-token tags following the > #### Example > > ```python -> from spacy.gold import offsets_from_biluo_tags +> from spacy.training import offsets_from_biluo_tags > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] @@ -590,7 +589,7 @@ Encode per-token tags following the | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | -### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} +### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into @@ -600,7 +599,7 @@ token-based tags, e.g. to overwrite the `doc.ents`. > #### Example > > ```python -> from spacy.gold import spans_from_biluo_tags +> from spacy.training import spans_from_biluo_tags > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] @@ -616,12 +615,12 @@ token-based tags, e.g. to overwrite the `doc.ents`. ## Utility functions {#util source="spacy/util.py"} spaCy comes with a small collection of utility functions located in -[`spacy/util.py`](https://github.com/explosion/spaCy/tree/master/spacy/util.py). -Because utility functions are mostly intended for **internal use within spaCy**, -their behavior may change with future releases. The functions documented on this -page should be safe to use and we'll try to ensure backwards compatibility. -However, we recommend having additional tests in place if your application -depends on any of spaCy's utilities. +[`spacy/util.py`](%%GITHUB_SPACY/spacy/util.py). Because utility functions are +mostly intended for **internal use within spaCy**, their behavior may change +with future releases. The functions documented on this page should be safe to +use and we'll try to ensure backwards compatibility. However, we recommend +having additional tests in place if your application depends on any of spaCy's +utilities. ### util.get_lang_class {#util.get_lang_class tag="function"} @@ -832,10 +831,10 @@ Compile a sequence of prefix rules into a regex object. > nlp.tokenizer.prefix_search = prefix_regex.search > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). 
~~Pattern~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} @@ -849,10 +848,10 @@ Compile a sequence of suffix rules into a regex object. > nlp.tokenizer.suffix_search = suffix_regex.search > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_infix_regex {#util.compile_infix_regex tag="function"} @@ -866,10 +865,10 @@ Compile a sequence of infix rules into a regex object. > nlp.tokenizer.infix_finditer = infix_regex.finditer > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.minibatch {#util.minibatch tag="function" new="2"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index b41a18890..d5bcef229 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -31,7 +31,7 @@ supports all models that are available via the Usually you will connect subsequent components to the shared transformer using the [TransformerListener](/api/architectures#TransformerListener) layer. This works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and -[Tok2VecListener](/api/architectures/Tok2VecListener) sublayer. +[Tok2VecListener](/api/architectures/#Tok2VecListener) sublayer. 
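As a minimal, hedged sketch of the `set_extra_annotations` callback documented in the tables below (the extension name `my_trf_output` is made up for illustration):

```python
from typing import List
from spacy.tokens import Doc

Doc.set_extension("my_trf_output", default=None)

def set_my_annotations(docs: List[Doc], trf_batch) -> None:
    # Matches the documented signature
    # Callable[[List[Doc], FullTransformerBatch], None]: store the raw
    # batch output on each Doc under a custom extension attribute.
    for doc in docs:
        doc._.my_trf_output = trf_batch
```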
The component assigns the output of the transformer to the `Doc`'s extension attributes. We also calculate an alignment between the word-piece tokens and the @@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | +| Setting | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. 
~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| _keyword-only_ | | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | +| Name | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| _keyword-only_ | | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | ## Transformer.\_\_call\_\_ {#call tag="method"} @@ -205,7 +205,7 @@ modifying them. Assign the extracted features to the Doc objects. By default, the [`TransformerData`](/api/transformer#transformerdata) object is written to the -[`Doc._.trf_data`](#custom-attributes) attribute. Your annotation_setter +[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` callback is then called, if provided. > #### Example diff --git a/website/docs/images/prodigy_overview.jpg b/website/docs/images/prodigy_overview.jpg new file mode 100644 index 000000000..84326ccea Binary files /dev/null and b/website/docs/images/prodigy_overview.jpg differ diff --git a/website/docs/images/projects.png b/website/docs/images/projects.png new file mode 100644 index 000000000..934e98e0a Binary files /dev/null and b/website/docs/images/projects.png differ diff --git a/website/docs/images/wandb1.jpg b/website/docs/images/wandb1.jpg new file mode 100644 index 000000000..3baf4aba0 Binary files /dev/null and b/website/docs/images/wandb1.jpg differ diff --git a/website/docs/images/wandb2.jpg b/website/docs/images/wandb2.jpg new file mode 100644 index 000000000..cd67c9aa4 Binary files /dev/null and b/website/docs/images/wandb2.jpg differ diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index f1fa1f3a2..239cec9d1 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -2,9 +2,8 @@ Every language is different – and usually full of **exceptions and special cases**, especially amongst the most common words. Some of these exceptions are shared across languages, while others are **entirely specific** – usually so specific that they need to be hard-coded. The -[`lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) module -contains all language-specific data, organized in simple Python files. This -makes the data easy to update and extend. +[`lang`](%%GITHUB_SPACY/spacy/lang) module contains all language-specific data, +organized in simple Python files. 
This makes the data easy to update and extend. The **shared language data** in the directory root includes rules that can be generalized across languages – for example, rules for basic punctuation, emoji, @@ -22,28 +21,12 @@ values are defined in the [`Language.Defaults`](/api/language#defaults). > nlp_de = German() # Includes German data > ``` -| Name | Description | -| ----------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Stop words**
[`stop_words.py`][stop_words.py] | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | -| **Tokenizer exceptions**
[`tokenizer_exceptions.py`][tokenizer_exceptions.py] | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | -| **Punctuation rules**
[`punctuation.py`][punctuation.py] | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | -| **Character classes**
[`char_classes.py`][char_classes.py] | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | -| **Lexical attributes**
[`lex_attrs.py`][lex_attrs.py] | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | -| **Syntax iterators**
[`syntax_iterators.py`][syntax_iterators.py] | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | -| **Lemmatizer**
[`lemmatizer.py`][lemmatizer.py] [`spacy-lookups-data`][spacy-lookups-data] | Custom lemmatizer implementation and lemmatization tables. | - -[stop_words.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py -[tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py -[punctuation.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py -[char_classes.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py -[lex_attrs.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py -[syntax_iterators.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py -[lemmatizer.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/lemmatizer.py -[spacy-lookups-data]: https://github.com/explosion/spacy-lookups-data +| Name | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Stop words**
[`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | +| **Tokenizer exceptions**
[`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | +| **Punctuation rules**
[`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | +| **Character classes**
[`char_classes.py`](%%GITHUB_SPACY/spacy/lang/char_classes.py) | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | +| **Lexical attributes**
[`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | +| **Syntax iterators**
[`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | +| **Lemmatizer**
[`lemmatizer.py`](%%GITHUB_SPACY/master/spacy/lang/fr/lemmatizer.py) [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) | Custom lemmatizer implementation and lemmatization tables. | diff --git a/website/docs/usage/_benchmarks-choi.md b/website/docs/usage/_benchmarks-choi.md deleted file mode 100644 index 47d6f479f..000000000 --- a/website/docs/usage/_benchmarks-choi.md +++ /dev/null @@ -1,10 +0,0 @@ -import { Help } from 'components/typography' - -| System | Year | Language | Accuracy | Speed (wps) | -| -------------- | ---- | --------------- | -------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| **spaCy v2.x** | 2017 | Python / Cython | **92.6** | _n/a_ This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware. | -| **spaCy v1.x** | 2015 | Python / Cython | 91.8 | 13,963 | -| ClearNLP | 2015 | Java | 91.7 | 10,271 | -| CoreNLP | 2015 | Java | 89.6 | 8,602 | -| MATE | 2015 | Java | 92.5 | 550 | -| Turbo | 2015 | C++ | 92.4 | 349 | diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md new file mode 100644 index 000000000..0c04dd8d5 --- /dev/null +++ b/website/docs/usage/_benchmarks-models.md @@ -0,0 +1,44 @@ +import { Help } from 'components/typography'; import Link from 'components/link' + + + +
+
+| System | Parser | Tagger | NER | WPS CPU (words per second on CPU, higher is better) | WPS GPU (words per second on GPU, higher is better) |
+| --------------------------------------------------------------- | -----: | -----: | ---: | ---: | ---: |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
+| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)¹ | _n/a_² | _n/a_² | 88.8 | 234 | 2k |
+| Flair | - | 97.9 | 89.3 | | |
+
+ +**Accuracy and speed on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
**1. ** +[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_: +Qi et al. don't report parsing and tagging results on OntoNotes. We're working +on training Stanza on this corpus to allow direct comparison. + +
+ +
+ +
+
+| System | POS | UAS | LAS |
+| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
+| spaCy RoBERTa (2020) | | | |
+| spaCy CNN (2020) | | | |
+| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
+| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
+
+ +**Accuracy on the Penn Treebank.** See +[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more +results. + +
+ +
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index abd92a8ac..8dd104ead 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -253,10 +253,10 @@ for doc in nlp.pipe(["some text", "some other text"]): You can also customize how the [`Transformer`](/api/transformer) component sets annotations onto the [`Doc`](/api/doc), by specifying a custom -`annotation_setter`. This callback will be called with the raw input and output -data for the whole batch, along with the batch of `Doc` objects, allowing you to -implement whatever you need. The annotation setter is called with a batch of -[`Doc`](/api/doc) objects and a +`set_extra_annotations` function. This callback will be called with the raw +input and output data for the whole batch, along with the batch of `Doc` +objects, allowing you to implement whatever you need. The annotation setter is +called with a batch of [`Doc`](/api/doc) objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the transformers data for the batch. @@ -267,7 +267,7 @@ def custom_annotation_setter(docs, trf_data): doc._.custom_attr = data nlp = spacy.load("en_core_trf_lg") -nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter +nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter doc = nlp("This is a text") assert isinstance(doc._.custom_attr, TransformerData) print(doc._.custom_attr.tensors) @@ -314,7 +314,7 @@ component: > get_spans=get_doc_spans, > tokenizer_config={"use_fast": True}, > ), -> annotation_setter=null_annotation_setter, +> set_extra_annotations=null_annotation_setter, > max_batch_items=4096, > ) > ``` @@ -333,7 +333,7 @@ tokenizer_config = {"use_fast": true} [components.transformer.model.get_spans] @span_getters = "spacy-transformers.doc_spans.v1" -[components.transformer.annotation_setter] +[components.transformer.set_extra_annotations] @annotation_setters = "spacy-transformers.null_annotation_setter.v1" ``` @@ -579,12 +579,17 @@ def MyCustomVectors( ## Pretraining {#pretraining} + + + + > #### Raw text format > diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index e2549ecfc..36f86dd51 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -5,254 +5,73 @@ next: /usage/spacy-101 menu: - ['Feature Comparison', 'comparison'] - ['Benchmarks', 'benchmarks'] + # TODO: - ['Citing spaCy', 'citation'] --- -## Feature comparison {#comparison} +## Comparison {#comparison hidden="true"} -Here's a quick comparison of the functionalities offered by spaCy, -[NLTK](http://www.nltk.org/py-modindex.html) and -[CoreNLP](http://stanfordnlp.github.io/CoreNLP/). +### When should I use spaCy? {#comparison-usage} -| | spaCy | NLTK | CoreNLP | -| ----------------------- | :----: | :----: | :-----------: | -| Programming language | Python | Python | Java / Python | -| Neural network models | ✅ | ❌ | ✅ | -| Integrated word vectors | ✅ | ❌ | ❌ | -| Multi-language support | ✅ | ✅ | ✅ | -| Tokenization | ✅ | ✅ | ✅ | -| Part-of-speech tagging | ✅ | ✅ | ✅ | -| Sentence segmentation | ✅ | ✅ | ✅ | -| Dependency parsing | ✅ | ❌ | ✅ | -| Entity recognition | ✅ | ✅ | ✅ | -| Entity linking | ✅ | ❌ | ❌ | -| Coreference resolution | ❌ | ❌ | ✅ | - -### When should I use what? 
{#comparison-usage} - -Natural Language Understanding is an active area of research and development, so -there are many different tools or technologies catering to different use-cases. -The table below summarizes a few libraries (spaCy, -[NLTK](http://www.nltk.org/py-modindex.html), [AllenNLP](https://allennlp.org/), -[StanfordNLP](https://stanfordnlp.github.io/stanfordnlp/) and -[TensorFlow](https://www.tensorflow.org/)) to help you get a feel for things fit -together. - -| | spaCy | NLTK | Allen-
NLP | Stanford-
NLP | Tensor-
Flow | -| ----------------------------------------------------------------- | :---: | :--: | :-------------: | :----------------: | :---------------: | -| I'm a beginner and just getting started with NLP. | ✅ | ✅ | ❌ | ✅ | ❌ | -| I want to build an end-to-end production application. | ✅ | ❌ | ❌ | ❌ | ✅ | -| I want to try out different neural network architectures for NLP. | ❌ | ❌ | ✅ | ❌ | ✅ | -| I want to try the latest models with state-of-the-art accuracy. | ❌ | ❌ | ✅ | ✅ | ✅ | -| I want to train models from my own data. | ✅ | ✅ | ✅ | ✅ | ✅ | -| I want my application to be efficient on CPU. | ✅ | ✅ | ❌ | ❌ | ❌ | +- ✅ **I'm a beginner and just getting started with NLP.** – spaCy makes it easy + to get started and comes with extensive documentation, including a + beginner-friendly [101 guide](/usage/spacy-101), a free interactive + [online course](https://course.spacy.io) and a range of + [video tutorials](https://www.youtube.com/c/ExplosionAI). +- ✅ **I want to build an end-to-end production application.** – spaCy is + specifically designed for production use and lets you build and train powerful + NLP pipelines and package them for easy deployment. +- ✅ **I want my application to be efficient on GPU _and_ CPU.** – While spaCy + lets you train modern NLP models that are best run on GPU, it also offers + CPU-optimized pipelines, which are less accurate but much cheaper to run. +- ✅ **I want to try out different neural network architectures for NLP.** – + spaCy lets you customize and swap out the model architectures powering its + components, and implement your own using a framework like PyTorch or + TensorFlow. The declarative configuration system makes it easy to mix and + match functions and keep track of your hyperparameters to make sure your + experiments are reproducible. +- ❌ **I want to build a language generation application.** – spaCy's focus is + natural language _processing_ and extracting information from large volumes of + text. While you can use it to help you re-write existing text, it doesn't + include any specific functionality for language generation tasks. +- ❌ **I want to research machine learning algorithms.** spaCy is built on the + latest research, but it's not a research library. If your goal is to write + papers and run benchmarks, spaCy is probably not a good choice. However, you + can use it to make the results of your research easily available for others to + use, e.g. via a custom spaCy component. ## Benchmarks {#benchmarks} -Two peer-reviewed papers in 2015 confirmed that spaCy offers the **fastest -syntactic parser in the world** and that **its accuracy is within 1% of the -best** available. The few systems that are more accurate are 20× slower or more. +spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy +right up to **current state-of-the-art**. You can also use a CPU-optimized +pipeline, which is less accurate but much cheaper to run. -> #### About the evaluation + + +> #### Evaluation details > -> The first of the evaluations was published by **Yahoo! Labs** and **Emory -> University**, as part of a survey of current parsing technologies -> ([Choi et al., 2015](https://aclweb.org/anthology/P/P15/P15-1038.pdf)). Their -> results and subsequent discussions helped us develop a novel -> psychologically-motivated technique to improve spaCy's accuracy, which we -> published in joint work with Macquarie University -> ([Honnibal and Johnson, 2015](https://www.aclweb.org/anthology/D/D15/D15-1162.pdf)). 
+> - **OntoNotes 5.0:** spaCy's English models are trained on this corpus, as +> it's several times larger than other English treebanks. However, most +> systems do not report accuracies on it. +> - **Penn Treebank:** The "classic" parsing evaluation for research. However, +> it's quite far removed from actual usage: it uses sentences with +> gold-standard segmentation and tokenization, from a pretty specific type of +> text (articles from a single newspaper, 1984-1989). -import BenchmarksChoi from 'usage/\_benchmarks-choi.md' +import Benchmarks from 'usage/\_benchmarks-models.md' - + -### Algorithm comparison {#algorithm} + -| System | Year | Type | Accuracy | -| ------------------------------------------------------------ | ---- | ------ | --------: | -| spaCy v2.0.0 | 2017 | neural | 94.48 | -| spaCy v1.1.0 | 2016 | linear | 92.80 | -| [Dozat and Manning][dozat and manning] | 2017 | neural | **95.75** | -| [Andor et al.][andor et al.] | 2016 | neural | 94.44 | -| [SyntaxNet Parsey McParseface][syntaxnet parsey mcparseface] | 2016 | neural | 94.15 | -| [Weiss et al.][weiss et al.] | 2015 | neural | 93.91 | -| [Zhang and McDonald][zhang and mcdonald] | 2014 | linear | 93.32 | -| [Martins et al.][martins et al.] | 2013 | linear | 93.10 | + diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index ee5fd0a3b..170e16591 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -166,10 +166,9 @@ $ python setup.py build_ext --inplace # compile spaCy ``` Compared to regular install via pip, the -[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt) -additionally installs developer dependencies such as Cython. See the -[quickstart widget](#quickstart) to get the right commands for your platform and -Python version. +[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs +developer dependencies such as Cython. See the [quickstart widget](#quickstart) +to get the right commands for your platform and Python version. #### Ubuntu {#source-ubuntu} @@ -195,16 +194,14 @@ that matches the version that was used to compile your Python interpreter. ### Run tests {#run-tests} -spaCy comes with an -[extensive test suite](https://github.com/explosion/spaCy/tree/master/spacy/tests). -In order to run the tests, you'll usually want to clone the -[repository](https://github.com/explosion/spaCy/tree/master/) and -[build spaCy from source](#source). This will also install the required +spaCy comes with an [extensive test suite](%%GITHUB_SPACY/spacy/tests). In order +to run the tests, you'll usually want to clone the [repository](%%GITHUB_SPACY) +and [build spaCy from source](#source). This will also install the required development dependencies and test utilities defined in the `requirements.txt`. Alternatively, you can find out where spaCy is installed and run `pytest` on that directory. 
Don't forget to also install the test utilities via spaCy's -[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt): +[`requirements.txt`](%%GITHUB_SPACY/requirements.txt): ```bash $ python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))" diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index e24b776c8..aefc64ece 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -5,7 +5,7 @@ menu: - ['Type Signatures', 'type-sigs'] - ['Swapping Architectures', 'swap-architectures'] - ['PyTorch & TensorFlow', 'frameworks'] - - ['Thinc Models', 'thinc'] + - ['Custom Thinc Models', 'thinc'] - ['Trainable Components', 'components'] next: /usage/projects --- @@ -28,9 +28,9 @@ A **model architecture** is a function that wires up a neural network that is run internally as part of a component in a spaCy pipeline. To define the actual architecture, you can implement your logic in Thinc directly, or you can use Thinc as a thin wrapper around frameworks such as -PyTorch, TensorFlow and MXNet. Each Model can also be used as a sublayer of a +PyTorch, TensorFlow and MXNet. Each `Model` can also be used as a sublayer of a larger network, allowing you to freely combine implementations from different -frameworks into one `Thinc` Model. +frameworks into a single model. spaCy's built-in components require a `Model` instance to be passed to them via the config system. To change the model architecture of an existing component, @@ -118,7 +118,7 @@ code. If no model is specified for the [`TextCategorizer`](/api/textcategorizer), the [TextCatEnsemble](/api/architectures#TextCatEnsemble) architecture is used by -default. This architecture combines a simpel bag-of-words model with a neural +default. This architecture combines a simple bag-of-words model with a neural network, usually resulting in the most accurate results, but at the cost of speed. The config file for this model would look something like this: @@ -225,31 +225,266 @@ you'll be able to try it out in any of the spaCy components. ​ Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks) written in other machine learning frameworks like PyTorch, TensorFlow and MXNet -using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as -**wrapping whole models**, Thinc lets you call into an external framework for -just **part of your model**: you can have a model where you use PyTorch just for -the transformer layers, using "native" Thinc layers to do fiddly input and -output transformations and add on task-specific "heads", as efficiency is less -of a consideration for those parts of the network. +using a unified [`Model`](https://thinc.ai/docs/api-model) API. This makes it +easy to use a model implemented in a different framework to power a component in +your spaCy pipeline. 
For example, to wrap a PyTorch model as a Thinc `Model`, +you can use Thinc's +[`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper): - +```python +from thinc.api import PyTorchWrapper -## Implementing models in Thinc {#thinc} +wrapped_pt_model = PyTorchWrapper(torch_model) +``` - +Let's use PyTorch to define a very simple neural network consisting of two +hidden `Linear` layers with `ReLU` activation and dropout, and a +softmax-activated output layer: -## Models for trainable components {#components} +```python +### PyTorch model +from torch import nn + +torch_model = nn.Sequential( + nn.Linear(width, hidden_width), + nn.ReLU(), + nn.Dropout2d(dropout), + nn.Linear(hidden_width, nO), + nn.ReLU(), + nn.Dropout2d(dropout), + nn.Softmax(dim=1) +) +``` + +The resulting wrapped `Model` can be used as a **custom architecture** as such, +or can be a **subcomponent of a larger model**. For instance, we can use Thinc's +[`chain`](https://thinc.ai/docs/api-layers#chain) combinator, which works like +`Sequential` in PyTorch, to combine the wrapped model with other components in a +larger network. This effectively means that you can easily wrap different +components from different frameworks, and "glue" them together with Thinc: + +```python +from thinc.api import chain, with_array, PyTorchWrapper +from spacy.ml import CharacterEmbed + +wrapped_pt_model = PyTorchWrapper(torch_model) +char_embed = CharacterEmbed(width, embed_size, nM, nC) +model = chain(char_embed, with_array(wrapped_pt_model)) +``` + +In the above example, we have combined our custom PyTorch model with a character +embedding layer defined by spaCy. +[CharacterEmbed](/api/architectures#CharacterEmbed) returns a `Model` that takes +a ~~List[Doc]~~ as input, and outputs a ~~List[Floats2d]~~. To make sure that +the wrapped PyTorch model receives valid inputs, we use Thinc's +[`with_array`](https://thinc.ai/docs/api-layers#with_array) helper. + +You could also implement a model that only uses PyTorch for the transformer +layers, and "native" Thinc layers to do fiddly input and output transformations +and add on task-specific "heads", as efficiency is less of a consideration for +those parts of the network. + +### Using wrapped models {#frameworks-usage} + +To use our custom model including the PyTorch subnetwork, all we need to do is +register the architecture using the +[`architectures` registry](/api/top-level#registry). This will assign the +architecture a name so spaCy knows how to find it, and allows passing in +arguments like hyperparameters via the [config](/usage/training#config). 
The +full example then becomes: + +```python +### Registering the architecture {highlight="9"} +from typing import List +from thinc.types import Floats2d +from thinc.api import Model, PyTorchWrapper, chain, with_array +import spacy +from spacy.tokens.doc import Doc +from spacy.ml import CharacterEmbed +from torch import nn + +@spacy.registry.architectures("CustomTorchModel.v1") +def create_torch_model( + nO: int, + width: int, + hidden_width: int, + embed_size: int, + nM: int, + nC: int, + dropout: float, +) -> Model[List[Doc], List[Floats2d]]: + char_embed = CharacterEmbed(width, embed_size, nM, nC) + torch_model = nn.Sequential( + nn.Linear(width, hidden_width), + nn.ReLU(), + nn.Dropout2d(dropout), + nn.Linear(hidden_width, nO), + nn.ReLU(), + nn.Dropout2d(dropout), + nn.Softmax(dim=1) + ) + wrapped_pt_model = PyTorchWrapper(torch_model) + model = chain(char_embed, with_array(wrapped_pt_model)) + return model +``` + +The model definition can now be used in any existing trainable spaCy component, +by specifying it in the config file. In this configuration, all required +parameters for the various subcomponents of the custom architecture are passed +in as settings via the config. + +```ini +### config.cfg (excerpt) {highlight="5-5"} +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "CustomTorchModel.v1" +nO = 50 +width = 96 +hidden_width = 48 +embed_size = 2000 +nM = 64 +nC = 8 +dropout = 0.2 +``` + + + +Remember that it is best not to rely on any (hidden) default values, to ensure +that training configs are complete and experiments fully reproducible. + + + +## Custom models with Thinc {#thinc} + +Of course it's also possible to define the `Model` from the previous section +entirely in Thinc. The Thinc documentation provides details on the +[various layers](https://thinc.ai/docs/api-layers) and helper functions +available. Combinators can also be used to +[overload operators](https://thinc.ai/docs/usage-models#operators) and a common +usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our +simple neural network would then become: + +```python +from thinc.api import chain, with_array, Model, Relu, Dropout, Softmax +from spacy.ml import CharacterEmbed + +char_embed = CharacterEmbed(width, embed_size, nM, nC) +with Model.define_operators({">>": chain}): + layers = ( + Relu(hidden_width, width) + >> Dropout(dropout) + >> Relu(hidden_width, hidden_width) + >> Dropout(dropout) + >> Softmax(nO, hidden_width) + ) + model = char_embed >> with_array(layers) +``` + + + +Note that Thinc layers define the output dimension (`nO`) as the first argument, +followed (optionally) by the input dimension (`nI`). This is in contrast to how +the PyTorch layers are defined, where `in_features` precedes `out_features`. + + + +### Shape inference in Thinc {#thinc-shape-inference} + +It is **not** strictly necessary to define all the input and output dimensions +for each layer, as Thinc can perform +[shape inference](https://thinc.ai/docs/usage-models#validation) between +sequential layers by matching up the output dimensionality of one layer to the +input dimensionality of the next. 
This means that we can simplify the `layers` +definition: + +> #### Diff +> +> ```diff +> layers = ( +> Relu(hidden_width, width) +> >> Dropout(dropout) +> - >> Relu(hidden_width, hidden_width) +> + >> Relu(hidden_width) +> >> Dropout(dropout) +> - >> Softmax(nO, hidden_width) +> + >> Softmax(nO) +> ) +> ``` + +```python +with Model.define_operators({">>": chain}): + layers = ( + Relu(hidden_width, width) + >> Dropout(dropout) + >> Relu(hidden_width) + >> Dropout(dropout) + >> Softmax(nO) + ) +``` + +Thinc can even go one step further and **deduce the correct input dimension** of +the first layer, and output dimension of the last. To enable this functionality, +you have to call +[`Model.initialize`](https://thinc.ai/docs/api-model#initialize) with an **input +sample** `X` and an **output sample** `Y` with the correct dimensions: + +```python +### Shape inference with initialization {highlight="3,7,10"} +with Model.define_operators({">>": chain}): + layers = ( + Relu(hidden_width) + >> Dropout(dropout) + >> Relu(hidden_width) + >> Dropout(dropout) + >> Softmax() + ) + model = char_embed >> with_array(layers) + model.initialize(X=input_sample, Y=output_sample) +``` + +The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure +that their internal models are **always initialized** with appropriate sample +data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a +~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This +functionality is triggered when +[`nlp.begin_training`](/api/language#begin_training) is called. + +### Dropout and normalization in Thinc {#thinc-dropout-norm} + +Many of the available Thinc [layers](https://thinc.ai/docs/api-layers) allow you +to define a `dropout` argument that will result in "chaining" an additional +[`Dropout`](https://thinc.ai/docs/api-layers#dropout) layer. Optionally, you can +often specify whether or not you want to add layer normalization, which would +result in an additional +[`LayerNorm`](https://thinc.ai/docs/api-layers#layernorm) layer. That means that +the following `layers` definition is equivalent to the previous: + +```python +with Model.define_operators({">>": chain}): + layers = ( + Relu(hidden_width, dropout=dropout, normalize=False) + >> Relu(hidden_width, dropout=dropout, normalize=False) + >> Softmax() + ) + model = char_embed >> with_array(layers) + model.initialize(X=input_sample, Y=output_sample) +``` + +## Create new trainable components {#components} + + + -![Diagram of a pipeline component with its model](../images/layers-architectures.svg) + diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index b36e9b71f..a229c18e9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -854,24 +854,22 @@ The algorithm can be summarized as follows: **Global** and **language-specific** tokenizer data is supplied via the language -data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). The -tokenizer exceptions define special cases like "don't" in English, which needs -to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. -The prefixes, suffixes and infixes mostly define punctuation rules – for -example, when to split off periods (at the end of a sentence), and when to leave -tokens containing periods intact (abbreviations like "U.S."). +data in [`spacy/lang`](%%GITHUB_SPACY/spacy/lang). 
The tokenizer exceptions +define special cases like "don't" in English, which needs to be split into two +tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. The prefixes, suffixes +and infixes mostly define punctuation rules – for example, when to split off +periods (at the end of a sentence), and when to leave tokens containing periods +intact (abbreviations like "U.S."). Tokenization rules that are specific to one language, but can be **generalized across that language** should ideally live in the language data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) – we -always appreciate pull requests! Anything that's specific to a domain or text -type – like financial trading abbreviations, or Bavarian youth slang – should be -added as a special case rule to your tokenizer instance. If you're dealing with -a lot of customizations, it might make sense to create an entirely custom -subclass. +[`spacy/lang`](%%GITHUB_SPACY/spacy/lang) – we always appreciate pull requests! +Anything that's specific to a domain or text type – like financial trading +abbreviations, or Bavarian youth slang – should be added as a special case rule +to your tokenizer instance. If you're dealing with a lot of customizations, it +might make sense to create an entirely custom subclass. @@ -1059,7 +1057,7 @@ but also detailed regular expressions that take the surrounding context into account. For example, there is a regular expression that treats a hyphen between letters as an infix. If you do not want the tokenizer to split on hyphens between letters, you can modify the existing infix definition from -[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py): +[`lang/punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py): ```python ### {executable="true"} @@ -1096,10 +1094,10 @@ print([t.text for t in doc]) # ['mother-in-law'] ``` For an overview of the default regular expressions, see -[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py) -and language-specific definitions such as -[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py) -for German. +[`lang/punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) and +language-specific definitions such as +[`lang/de/punctuation.py`](%%GITHUB_SPACY/spacy/lang/de/punctuation.py) for +German. ### Hooking a custom tokenizer into the pipeline {#custom-tokenizer} @@ -1366,7 +1364,7 @@ token. ```python ### {executable="true"} -from spacy.gold import Alignment +from spacy.training import Alignment other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 9b1e96e4e..e94cdfe9e 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -76,7 +76,7 @@ spaCy also supports pipelines trained on more than one language. This is especially useful for named entity recognition. The language ID used for multi-language or language-neutral pipelines is `xx`. The language class, a generic subclass containing only the base language data, can be found in -[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). +[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). To train a pipeline using the neutral multi-language class, you can set `lang = "xx"` in your [training config](/usage/training#config). 
You can also diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0da350f27..3d756215f 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1028,11 +1028,11 @@ plug fully custom machine learning components into your pipeline. You'll need the following: 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This - can be a model using [layers](https://thinc.ai/docs/api-layers) implemented - in Thinc, or a [wrapped model](https://thinc.ai/docs/usage-frameworks) - implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The - model must take a list of [`Doc`](/api/doc) objects as input and can have any - type of output. + can be a model using implemented in + [Thinc](/usage/layers-architectures#thinc), or a + [wrapped model](/usage/layers-architectures#frameworks) implemented in + PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a + list of [`Doc`](/api/doc) objects as input and can have any type of output. 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least two methods: [`Pipe.predict`](/api/pipe#predict) and [`Pipe.set_annotations`](/api/pipe#set_annotations). @@ -1078,8 +1078,9 @@ _first_ create a `Model` from a [registered architecture](/api/architectures), validate its arguments and _then_ pass the object forward to the component. This means that the config can express very complex, nested trees of objects – but the objects don't have to pass the model settings all the way down to the -components. It also makes the components more **modular** and lets you swap -different architectures in your config, and re-use model definitions. +components. It also makes the components more **modular** and lets you +[swap](/usage/layers-architectures#swap-architectures) different architectures +in your config, and re-use model definitions. ```ini ### config.cfg (excerpt) @@ -1134,7 +1135,7 @@ loss is calculated and to add evaluation scores to the training output. For more details on how to implement your own trainable components and model architectures, and plug existing models implemented in PyTorch or TensorFlow into your spaCy pipeline, see the usage guide on -[layers and model architectures](/usage/layers-architectures#components). +[layers and model architectures](/usage/layers-architectures).
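To make the division of labor concrete, here is a minimal sketch of such a component. The names `MyCustomPipe` and `my_scores` are hypothetical, and a real trainable component would also implement methods like `get_loss` – see the guide linked above for the full picture:

```python
from spacy.pipeline import Pipe
from spacy.tokens import Doc

# Hypothetical custom attribute used to store the model's output
Doc.set_extension("my_scores", default=None)

class MyCustomPipe(Pipe):
    """Sketch of a custom pipe. Assumes self.model is a Thinc Model that
    takes a List[Doc] as input."""

    def predict(self, docs):
        # Apply the model to a batch of Doc objects, without modifying them
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        # Write the predictions back onto the Doc objects
        for doc, doc_scores in zip(docs, scores):
            doc._.my_scores = doc_scores
```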
@@ -1500,7 +1501,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. -The [`gold.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very +The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. @@ -1515,7 +1516,7 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. ```python ### {highlight="1,8-9"} import your_custom_entity_recognizer -from spacy.gold import offsets_from_biluo_tags +from spacy.training import offsets_from_biluo_tags from spacy.language import Language @Language.component("custom_ner_wrapper") diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index b6688cd5d..81ddf40fb 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -65,6 +65,8 @@ project template and copies the files to a local directory. You can then run the project, e.g. to train a pipeline and edit the commands and scripts to build fully custom workflows. + + ```cli python -m spacy project clone some_example_project ``` @@ -217,7 +219,7 @@ pipelines. ```yaml -https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.yml +https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml ``` | Section | Description | @@ -726,18 +728,21 @@ workflows, but only one can be tracked by DVC.
- + --- ### Prodigy {#prodigy} + + +The Prodigy integration will require a nightly version of Prodigy that supports +spaCy v3+. + + + [Prodigy](https://prodi.gy) is a modern annotation tool for creating training data for machine learning models, developed by us. It integrates with spaCy out-of-the-box and provides many different @@ -793,9 +798,7 @@ results. -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. + @@ -803,43 +806,34 @@ mattis pretium. ### Streamlit {#streamlit} - - -
- [Streamlit](https://streamlit.io) is a Python framework for building interactive data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) package helps you integrate spaCy visualizations into your Streamlit apps and quickly spin up demos to explore your pipelines interactively. It includes a full embedded visualizer, as well as individual components. -```bash -$ pip install spacy_streamlit -``` + -
+> #### Installation +> +> ```bash +> $ pip install "spacy_streamlit>=1.0.0a0" +> ``` ![](../images/spacy-streamlit.png) -
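For reference, the script that `streamlit run` points to can be as small as the following sketch – the pipeline name and default text are placeholders, and the full version lives in the project template linked below:

```python
### visualize.py (sketch)
import spacy_streamlit

# Placeholders – point this at your own trained pipeline(s) and example text
spacy_streamlit.visualize(
    ["en_core_web_sm"],
    "Sundar Pichai is the CEO of Google.",
    visualizers=["ner"],
)
```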
- Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your projects can easily define their own scripts that spin up an interactive visualizer, using the latest pipeline you trained, or a selection of pipelines -so you can compare their results. The following script starts an -[NER visualizer](/usage/visualizers#ent) and takes two positional command-line -argument you can pass in from your `config.yml`: a comma-separated list of paths -to load the pipelines from and an example text to use as the default text. +so you can compare their results. -```python -### scripts/visualize.py -import spacy_streamlit -import sys + -DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else "" -PIPELINES = [name.strip() for name in sys.argv[1].split(",")] -spacy_streamlit.visualize(PIPELINES, DEFAULT_TEXT, visualizers=["ner"]) -``` +Get started with spaCy and Streamlit using our project template. It includes a +script to spin up a custom visualizer and commands you can adjust to showcase +and explore your own custom trained pipelines. + + > #### Example usage > @@ -856,16 +850,16 @@ commands: script: - 'streamlit run ./scripts/visualize.py ./training/model-best "I like Adidas shoes."' deps: - - 'training/model-best' + - "training/model-best" ``` - +The following script is called from the `project.yml` and takes two positional +command-line argument: a comma-separated list of paths or packages to load the +pipelines from and an example text to use as the default text. -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. - - +```python +https://github.com/explosion/projects/blob/v3/integrations/streamlit/scripts/visualize.py +``` --- @@ -878,9 +872,13 @@ library for serving machine learning models and you can use it in your spaCy projects to quickly serve up a trained pipeline and make it available behind a REST API. -```python -# TODO: show an example that addresses some of the main concerns for serving ML (workers etc.) -``` + + +Get started with spaCy and FastAPI using our project template. It includes a +simple REST API for processing batches of text, and usage examples for how to +query your API from Python and JavaScript (Vanilla JS and React). + + > #### Example usage > @@ -891,32 +889,66 @@ REST API. ```yaml ### project.yml -commands: - - name: serve - help: "Serve the trained pipeline with FastAPI" + - name: "serve" + help: "Serve the models via a FastAPI REST API using the given host and port" script: - - 'python ./scripts/serve.py ./training/model-best' + - "uvicorn scripts.main:app --reload --host 127.0.0.1 --port 5000" deps: - - 'training/model-best' + - "scripts/main.py" no_skip: true ``` - +The script included in the template shows a simple REST API with a `POST` +endpoint that accepts batches of texts and returns batches of predictions, e.g. +named entities found in the documents. Type hints and +[`pydantic`](https://github.com/samuelcolvin/pydantic) are used to define the +expected data types. -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. - - +```python +https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.py +``` --- ### Ray {#ray} + + + --- ### Weights & Biases {#wandb} - +[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment +tracking. 
spaCy integrates with it out-of-the-box via the +[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the +`[training.logger]` block of your training [config](/usage/training#config). The +results of each step are then logged in your project, together with the full +**training config**. This means that _every_ hyperparameter, registered function +name and argument will be tracked and you'll be able to see the impact it has on +your results. + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.WandbLogger.v1" +> project_name = "monitor_spacy_training" +> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> ``` + +![Screenshot: Visualized training results](../images/wandb1.jpg) + +![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') + + + +Get started with tracking your spaCy training runs in Weights & Biases using our +project template. It includes a simple config using the `WandbLogger`, as well +as a custom logger implementation you can adjust for your specific use case. + + + + diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 01d60ddb8..2d6159f3d 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -192,12 +192,11 @@ of [`Token`](/api/token). This means that all of the attributes that refer to computed properties can't be accessed. The uppercase attribute names like `LOWER` or `IS_PUNCT` refer to symbols from -the -[`spacy.attrs`](https://github.com/explosion/spaCy/tree/master/spacy/attrs.pyx) -enum table. They're passed into a function that essentially is a big case/switch -statement, to figure out which struct field to return. The same attribute -identifiers are used in [`Doc.to_array`](/api/doc#to_array), and a few other -places in the code where you need to describe fields like this. +the [`spacy.attrs`](%%GITHUB_SPACY/spacy/attrs.pyx) enum table. They're passed +into a function that essentially is a big case/switch statement, to figure out +which struct field to return. The same attribute identifiers are used in +[`Doc.to_array`](/api/doc#to_array), and a few other places in the code where +you need to describe fields like this. diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 9955e7d84..c0fe1323c 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -187,11 +187,11 @@ add to that data and saves and loads the data to and from a JSON file. > > To see custom serialization methods in action, check out the new > [`EntityRuler`](/api/entityruler) component and its -> [source](https://github.com/explosion/spaCy/tree/master/spacy/pipeline/entityruler.py). -> Patterns added to the component will be saved to a `.jsonl` file if the -> pipeline is serialized to disk, and to a bytestring if the pipeline is -> serialized to bytes. This allows saving out a pipeline with a rule-based -> entity recognizer and including all rules _with_ the component data. +> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the +> component will be saved to a `.jsonl` file if the pipeline is serialized to +> disk, and to a bytestring if the pipeline is serialized to bytes. This allows +> saving out a pipeline with a rule-based entity recognizer and including all +> rules _with_ the component data. 
```python ### {highlight="14-18,20-25"} diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 82fec4b6a..cd1b2cb0c 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -84,15 +84,13 @@ systems, or to pre-process text for **deep learning**. ### What spaCy isn't {#what-spacy-isnt} -- **spaCy is not a platform or "an API"**. Unlike a platform, spaCy does not +- ❌ **spaCy is not a platform or "an API"**. Unlike a platform, spaCy does not provide a software as a service, or a web application. It's an open-source library designed to help you build NLP applications, not a consumable service. - -- **spaCy is not an out-of-the-box chat bot engine**. While spaCy can be used to - power conversational applications, it's not designed specifically for chat +- ❌ **spaCy is not an out-of-the-box chat bot engine**. While spaCy can be used + to power conversational applications, it's not designed specifically for chat bots, and only provides the underlying text processing capabilities. - -- **spaCy is not research software**. It's built on the latest research, but +- ❌**spaCy is not research software**. It's built on the latest research, but it's designed to get things done. This leads to fairly different design decisions than [NLTK](https://github.com/nltk/nltk) or [CoreNLP](https://stanfordnlp.github.io/CoreNLP/), which were created as @@ -101,8 +99,7 @@ systems, or to pre-process text for **deep learning**. between multiple algorithms that deliver equivalent functionality. Keeping the menu small lets spaCy deliver generally better performance and developer experience. - -- **spaCy is not a company**. It's an open-source library. Our company +- ❌ **spaCy is not a company**. It's an open-source library. Our company publishing spaCy and other software is called [Explosion](https://explosion.ai). @@ -494,7 +491,7 @@ regressions to the parts of the library that you care about the most. **For more details on the types of contributions we're looking for, the code conventions and other useful tips, make sure to check out the -[contributing guidelines](https://github.com/explosion/spaCy/tree/master/CONTRIBUTING.md).** +[contributing guidelines](%%GITHUB_SPACY/CONTRIBUTING.md).** diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9c18e4606..4b25d1c21 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -59,7 +59,7 @@ specific use case. It's also available in spaCy as the import QuickstartTraining from 'widgets/quickstart-training.js' - + After you've saved the starter config to a file `base_config.cfg`, you can use the [`init fill-config`](/api/cli#init-fill-config) command to fill in the @@ -127,7 +127,7 @@ Some of the main advantages and features of spaCy's training config are: config which types of data to expect. ```ini -https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg +%%GITHUB_SPACY/spacy/default_config.cfg ``` Under the hood, the config is parsed into a dictionary. It's divided into @@ -683,7 +683,7 @@ You can also implement your own batch size schedule to use during training. The import spacy @spacy.registry.schedules("my_custom_schedule.v1") -def my_custom_schedule(start: int = 1, factor: int = 1.001): +def my_custom_schedule(start: int = 1, factor: float = 1.001): while True: yield start start = start * factor @@ -735,7 +735,7 @@ as **config settings** – in this case, `source`. 
### functions.py {highlight="7-8"} from typing import Callable, Iterator, List import spacy -from spacy.gold import Example +from spacy.training import Example from spacy.language import Language import random @@ -783,7 +783,7 @@ annotations are the same. ### functions.py from typing import Callable, Iterable, Iterator, List import spacy -from spacy.gold import Example +from spacy.training import Example @spacy.registry.batchers("filtering_batch.v1") def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]: diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index f7bcc17d3..aee3c24a6 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -76,9 +76,7 @@ noise contrastive estimation or reinforcement learning. ## New features {#features} This section contains an overview of the most important **new features and -improvements**. The [API docs](/api) include additional deprecation notes. New -methods and functions that were introduced in this version are marked with the -tag 2. +improvements**. The [API docs](/api) include additional deprecation notes. ### Convolutional neural network models {#features-models} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 3cbccc352..791b641df 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -8,20 +8,30 @@ menu: - ['Migrating from v2.x', 'migrating'] --- -## Summary {#summary} +## Summary {#summary hidden="true"} - +
+spaCy v3.0 features all new **transformer-based pipelines** that bring spaCy's +accuracy right up to the current **state-of-the-art**. You can use any +pretrained transformer to train your own pipelines, and even share one +transformer between multiple components with **multi-task learning**. Training +is now fully configurable and extensible, and you can define your own custom +models using **PyTorch**, **TensorFlow** and other frameworks. The new spaCy +projects system lets you describe whole **end-to-end workflows** in a single +file, giving you an easy path from prototype to production, and making it easy +to clone and adapt best-practice projects for your own use cases. +
- [Summary](#summary) - [New features](#features) -- [Training & config system](#features-training) - [Transformer-based pipelines](#features-transformers) +- [Training & config system](#features-training) - [Custom models](#features-custom-models) - [End-to-end project workflows](#features-projects) - [New built-in components](#features-pipeline-components) @@ -39,47 +49,126 @@ menu: ## New Features {#features} -### New training workflow and config system {#features-training} - - - -- **Usage:** [Training pipelines and models](/usage/training) -- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config), - [`Config`](https://thinc.ai/docs/api-config#config) -- **CLI:** [`train`](/api/cli#train), [`pretrain`](/api/cli#pretrain), - [`evaluate`](/api/cli#evaluate) -- **API:** [Config format](/api/data-formats#config), - [`registry`](/api/top-level#registry) - - +This section contains an overview of the most important **new features and +improvements**. The [API docs](/api) include additional deprecation notes. New +methods and functions that were introduced in this version are marked with the +tag 3. ### Transformer-based pipelines {#features-transformers} +> #### Example +> +> ```cli +> $ python -m spacy download en_core_web_trf +> ``` + +spaCy v3.0 features all new transformer-based pipelines that bring spaCy's +accuracy right up to the current **state-of-the-art**. You can use any +pretrained transformer to train your own pipelines, and even share one +transformer between multiple components with **multi-task learning**. spaCy's +transformer support interoperates with [PyTorch](https://pytorch.org) and the +[HuggingFace `transformers`](https://huggingface.co/transformers/) library, +giving you access to thousands of pretrained models for your pipelines. + ![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg) +import Benchmarks from 'usage/\_benchmarks-models.md' + + + - **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), - [Training pipelines and models](/usage/training) + [Training pipelines and models](/usage/training), + [Benchmarks](/usage/facts-figures#benchmarks) - **API:** [`Transformer`](/api/transformer), [`TransformerData`](/api/transformer#transformerdata), [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [TransformerListener](/api/architectures#TransformerListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Trained Pipelines:** [`en_core_trf_lg_sm`](/models/en) +- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) +### New training workflow and config system {#features-training} + +> #### Example +> +> ```ini +> [training] +> vectors = null +> accumulate_gradient = 3 +> +> [training.optimizer] +> @optimizers = "Adam.v1" +> +> [training.optimizer.learn_rate] +> @schedules = "warmup_linear.v1" +> warmup_steps = 250 +> total_steps = 20000 +> initial_rate = 0.01 +> ``` + +spaCy v3.0 introduces a comprehensive and extensible system for **configuring +your training runs**. A single configuration file describes every detail of your +training run, with no hidden defaults, making it easy to rerun your experiments +and track changes. You can use the +[quickstart widget](/usage/training#quickstart) or the `init config` command to +get started. 
Instead of providing lots of arguments on the command line, you +only need to pass your `config.cfg` file to `spacy train`. + +Training config files include all **settings and hyperparameters** for training +your pipeline. Some settings can also be registered **functions** that you can +swap out and customize, making it easy to implement your own custom models and +architectures. + + + +- **Usage:** [Training pipelines and models](/usage/training) +- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config), + [`Config`](https://thinc.ai/docs/api-config#config) +- **CLI:** [`init config`](/api/cli#init-config), + [`init fill-config`](/api/cli#init-fill-config), [`train`](/api/cli#train), + [`pretrain`](/api/cli#pretrain), [`evaluate`](/api/cli#evaluate) +- **API:** [Config format](/api/data-formats#config), + [`registry`](/api/top-level#registry) + + + ### Custom models using any framework {#features-custom-models} +> #### Example +> +> ```python +> from torch import nn +> from thinc.api import PyTorchWrapper +> +> torch_model = nn.Sequential( +> nn.Linear(32, 32), +> nn.ReLU(), +> nn.Softmax(dim=1) +> ) +> model = PyTorchWrapper(torch_model) +> ``` + +spaCy's new configuration system makes it easy to customize the neural network +models used by the different pipeline components. You can also implement your +own architectures via spaCy's machine learning library [Thinc](https://thinc.ai) +that provides various layers and utilities, as well as thin wrappers around +frameworks like **PyTorch**, **TensorFlow** and **MXNet**. Component models all +follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each +`Model` can also be used as a sublayer of a larger network, allowing you to +freely combine implementations from different frameworks into a single model. + - **Usage: ** [Layers and architectures](/usage/layers-architectures) - **Thinc: ** - [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks) + [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks), + [`Model` API](https://thinc.ai/docs/api-model) - **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe) @@ -159,8 +248,7 @@ add to your pipeline and customize for your use case: - **Usage:** [Processing pipelines](/usage/processing-pipelines) - **API:** [Built-in pipeline components](/api#architecture-pipeline) -- **Implementation:** - [`spacy/pipeline`](https://github.com/explosion/spaCy/tree/develop/spacy/pipeline) +- **Implementation:** [`spacy/pipeline`](%%GITHUB_SPACY/spacy/pipeline) @@ -197,15 +285,12 @@ aren't set. [`@Language.factory`](/api/language#factory), [`Language.add_pipe`](/api/language#add_pipe), [`Language.analyze_pipes`](/api/language#analyze_pipes) -- **Implementation:** - [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py) +- **Implementation:** [`spacy/language.py`](%%GITHUB_SPACY/spacy/language.py)
### Dependency matching {#features-dep-matcher} - - > #### Example > > ```python @@ -233,7 +318,7 @@ dictionaries**, with each dictionary describing a **token to match** and its [Dependency matching](/usage/rule-based-matching#dependencymatcher), - **API:** [`DependencyMatcher`](/api/dependencymatcher), - **Implementation:** - [`spacy/matcher/dependencymatcher.pyx`](https://github.com/explosion/spaCy/tree/develop/spacy/matcher/dependencymatcher.pyx) + [`spacy/matcher/dependencymatcher.pyx`](%%GITHUB_SPACY/spacy/matcher/dependencymatcher.pyx) @@ -404,11 +489,12 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that returns a sequence of `Example` objects to initialize the model instead of a list of tuples. -- [`Matcher.add`](/api/matcher#add), - [`PhraseMatcher.add`](/api/phrasematcher#add) and - [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list - of patterns as the second argument (instead of a variable number of - arguments). The `on_match` callback becomes an optional keyword argument. +- [`Matcher.add`](/api/matcher#add) and + [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of + patterns as the second argument (instead of a variable number of arguments). + The `on_match` callback becomes an optional keyword argument. +- The `spacy.gold` module has been renamed to + [`spacy.training`](%%GITHUB_SPACY/spacy/training). - The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has been removed. - The `TAG_MAP` and `MORPH_RULES` in the language data have been replaced by the @@ -779,6 +865,20 @@ python -m spacy package ./output ./packages - python setup.py sdist ``` +#### Data utilities and gold module {#migrating-gold} + +The `spacy.gold` module has been renamed to `spacy.training`. This mostly +affects internals, but if you've been using the span offset conversion utilities +[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), +[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or +[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to +change your imports: + +```diff +- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags ++ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags +``` + #### Migration notes for plugin maintainers {#migrating-plugins} Thanks to everyone who's been contributing to the spaCy ecosystem by developing diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 6c67de6ea..78fdc336f 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -8,7 +8,6 @@ const codeBlocksPlugin = require('./src/plugins/remark-code-blocks.js') // Import metadata const site = require('./meta/site.json') -const logos = require('./meta/logos.json') const sidebars = require('./meta/sidebars.json') const models = require('./meta/languages.json') const universe = require('./meta/universe.json') @@ -20,11 +19,16 @@ const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png const binderBranch = isNightly ? 'nightly' : site.binderBranch const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl const domain = isNightly ? site.domainNightly : site.domain +const branch = isNightly ? 'develop' : 'master' + +// Those variables are going to be replaced in the Markdown, e.g. 
%%GITHUB_SPACY +const replacements = { + GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, +} module.exports = { siteMetadata: { ...site, - ...logos, sidebars, ...models, universe, @@ -121,6 +125,13 @@ module.exports = { { resolve: `gatsby-remark-copy-linked-files`, }, + { + resolve: 'gatsby-remark-find-replace', + options: { + replacements, + prefix: '%%', + }, + }, ], }, }, diff --git a/website/meta/logos.json b/website/meta/logos.json deleted file mode 100644 index 783995026..000000000 --- a/website/meta/logos.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "logosUsers": [ - { "id": "airbnb", "url": "https://www.airbnb.com" }, - { "id": "uber", "url": "https://www.uber.com" }, - { "id": "quora", "url": "https://www.quora.com" }, - { "id": "retriever", "url": "https://www.retriever.no" }, - { "id": "stitchfix", "url": "https://www.stitchfix.com" }, - { "id": "chartbeat", "url": "https://chartbeat.com" }, - { "id": "allenai", "url": "https://allenai.org" } - ], - "logosPublications": [ - { - "id": "recode", - "url": "https://www.recode.net/2017/6/22/15855492/ai-artificial-intelligence-nonprofit-good-human-chatbots-machine-learning" - }, - { - "id": "wapo", - "url": "https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/" - }, - { - "id": "bbc", - "url": "http://www.bbc.co.uk/rd/blog/2017-08-irfs-weeknotes-number-250" - }, - { - "id": "microsoft", - "url": "https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/" - }, - { - "id": "venturebeat", - "url": "https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/" - }, - { - "id": "thoughtworks", - "url": "https://www.thoughtworks.com/radar/tools" - } - ] -} diff --git a/website/meta/site.json b/website/meta/site.json index d1162edf9..1955932b9 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -28,7 +28,7 @@ }, "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", - "binderVersion": "2.3.0", + "binderVersion": "3.0.0", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, @@ -47,20 +47,19 @@ "items": [ { "text": "Usage", "url": "/usage" }, { "text": "Models", "url": "/models" }, - { "text": "API", "url": "/api" }, - { "text": "Universe", "url": "/universe" } + { "text": "API Reference", "url": "/api" }, + { "text": "Online Course", "url": "https://course.spacy.io" } ] }, { - "label": "Support", + "label": "Community", "items": [ + { "text": "Universe", "url": "/universe" }, { "text": "Issue Tracker", "url": "https://github.com/explosion/spaCy/issues" }, { "text": "Stack Overflow", "url": "http://stackoverflow.com/questions/tagged/spacy" - }, - { "text": "Reddit User Group", "url": "https://www.reddit.com/r/spacynlp/" }, - { "text": "Gitter Chat", "url": "https://gitter.im/explosion/spaCy" } + } ] }, { diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json index b1d94403d..79d4d357d 100644 --- a/website/meta/type-annotations.json +++ b/website/meta/type-annotations.json @@ -34,6 +34,8 @@ "Floats2d": "https://thinc.ai/docs/api-types#types", "Floats3d": "https://thinc.ai/docs/api-types#types", "FloatsXd": "https://thinc.ai/docs/api-types#types", + "Array1d": "https://thinc.ai/docs/api-types#types", + "Array2d": "https://thinc.ai/docs/api-types#types", "Ops": "https://thinc.ai/docs/api-backends#ops", "cymem.Pool": 
"https://github.com/explosion/cymem", "preshed.BloomFilter": "https://github.com/explosion/preshed", diff --git a/website/package-lock.json b/website/package-lock.json index d995f910e..d8444c2b2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -14238,6 +14238,46 @@ } } }, + "gatsby-remark-find-replace": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/gatsby-remark-find-replace/-/gatsby-remark-find-replace-0.3.0.tgz", + "integrity": "sha512-tTXt+ZxD+7hEVtZVbZVrifcQUk2mt4uJNUHhc9cje+93sDa4PrrFBbny9IWgXLj9QH9xDxWOZrI768ApMtbPUQ==", + "requires": { + "escape-string-regexp": "^2.0.0", + "unist-util-visit": "^2.0.1" + }, + "dependencies": { + "escape-string-regexp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz", + "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==" + }, + "unist-util-is": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-4.0.2.tgz", + "integrity": "sha512-Ofx8uf6haexJwI1gxWMGg6I/dLnF2yE+KibhD3/diOqY2TinLcqHXCV6OI5gFVn3xQqDH+u0M625pfKwIwgBKQ==" + }, + "unist-util-visit": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-2.0.3.tgz", + "integrity": "sha512-iJ4/RczbJMkD0712mGktuGpm/U4By4FfDonL7N/9tATGIF4imikjOuagyMY53tnZq3NP6BcmlrHhEKAfGWjh7Q==", + "requires": { + "@types/unist": "^2.0.0", + "unist-util-is": "^4.0.0", + "unist-util-visit-parents": "^3.0.0" + } + }, + "unist-util-visit-parents": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-3.1.0.tgz", + "integrity": "sha512-0g4wbluTF93npyPrp/ymd3tCDTMnP0yo2akFD2FIBAYXq/Sga3lwaU1D8OYKbtpioaI6CkDcQ6fsMnmtzt7htw==", + "requires": { + "@types/unist": "^2.0.0", + "unist-util-is": "^4.0.0" + } + } + } + }, "gatsby-remark-images": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/gatsby-remark-images/-/gatsby-remark-images-3.0.4.tgz", @@ -22152,6 +22192,14 @@ "clipboard": "^2.0.0" } }, + "prismjs-bibtex": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/prismjs-bibtex/-/prismjs-bibtex-1.1.0.tgz", + "integrity": "sha512-IjZUJP3iTkV1DZ8qcjUF7p7Ji/LPns56jw+INUBPtnBaX4Q/VhtzlRGHM0lxSvdfqUvqgTGF3oM8aogWqzZz2g==", + "requires": { + "prismjs": "^1.15" + } + }, "private": { "version": "0.1.8", "resolved": "https://registry.npmjs.org/private/-/private-0.1.8.tgz", diff --git a/website/package.json b/website/package.json index 40018f532..def94a9c2 100644 --- a/website/package.json +++ b/website/package.json @@ -31,6 +31,7 @@ "gatsby-plugin-sitemap": "^2.0.5", "gatsby-plugin-svgr": "^2.0.1", "gatsby-remark-copy-linked-files": "^2.0.9", + "gatsby-remark-find-replace": "^0.3.0", "gatsby-remark-images": "^3.0.4", "gatsby-remark-prismjs": "^3.2.4", "gatsby-remark-smartypants": "^2.0.8", @@ -44,6 +45,7 @@ "node-sass": "^4.11.0", "parse-numeric-range": "0.0.2", "prismjs": "^1.15.0", + "prismjs-bibtex": "^1.1.0", "prop-types": "^15.7.2", "react": "^16.8.2", "react-dom": "^16.8.2", diff --git a/website/src/components/code.js b/website/src/components/code.js index f075539ea..5a7828a33 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -2,6 +2,7 @@ import React, { Fragment } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' +import 'prismjs-bibtex' import 
rangeParser from 'parse-numeric-range' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' diff --git a/website/src/components/grid.js index 1d11a748f..299fcf931 100644 --- a/website/src/components/grid.js +++ b/website/src/components/grid.js @@ -9,6 +9,7 @@ export default function Grid({ narrow = false, gutterBottom = true, className, + style, children, }) { const gridClassNames = classNames(classes.root, className, { @@ -18,7 +19,11 @@ export default function Grid({ [classes.third]: cols === 3, [classes.quarter]: cols === 4, }) - return <div className={gridClassNames}>{children}</div> + return ( + <div className={gridClassNames} style={style}> + {children} + </div>
+ ) } Grid.propTypes = { diff --git a/website/src/components/icon.js index 8dfba7426..799b20eda 100644 --- a/website/src/components/icon.js +++ b/website/src/components/icon.js @@ -1,4 +1,4 @@ -import React from 'react' +import React, { Fragment } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' @@ -25,6 +25,7 @@ import { ReactComponent as NetworkIcon } from '../images/icons/network.svg' import { ReactComponent as DownloadIcon } from '../images/icons/download.svg' import { ReactComponent as PackageIcon } from '../images/icons/package.svg' +import { isString } from './util' import classes from '../styles/icon.module.sass' const icons = { @@ -88,3 +89,44 @@ Icon.propTypes = { variant: PropTypes.oneOf(['success', 'error', 'subtle']), className: PropTypes.string, } + +export function replaceEmoji(cellChildren) { + const icons = { + '✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' }, + '❌': { name: 'no', variant: 'error', 'aria-label': 'negative' }, + } + const iconRe = new RegExp(`^(${Object.keys(icons).join('|')})`, 'g') + let children = isString(cellChildren) ? [cellChildren] : cellChildren + let hasIcon = false + if (Array.isArray(children)) { + children = children.map((child, i) => { + if (isString(child)) { + const icon = icons[child.trim()] + if (icon) { + hasIcon = true + return ( + <Icon {...icon} key={i} /> + ) + } else if (iconRe.test(child)) { + hasIcon = true + const [, iconName, text] = child.split(iconRe) + return ( + <Fragment key={i}> + <Icon {...icons[iconName]} /> + {text.replace(/^\s+/g, '')} + </Fragment> + ) + } + // Work around prettier auto-escape + if (child.startsWith('\\')) return child.slice(1) + } + return child + }) + } + return { content: children, hasIcon } +}
diff --git a/website/src/components/infobox.js index 363638bf2..968b6cea8 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -23,7 +23,7 @@ export default function Infobox({