Mirror of https://github.com/explosion/spaCy.git (synced 2025-06-03 12:43:15 +03:00)

commit 5bed6fc431
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -78,8 +78,7 @@ def read_data(
         head = int(head) - 1 if head != "0" else id_
         sent["words"].append(word)
         sent["tags"].append(tag)
-        sent["morphology"].append(_parse_morph_string(morph))
-        sent["morphology"][-1].add("POS_%s" % pos)
+        sent["morphs"].append(_compile_morph_string(morph, pos))
         sent["heads"].append(head)
         sent["deps"].append("ROOT" if dep == "root" else dep)
         sent["spaces"].append(space_after == "_")
@@ -88,12 +87,12 @@ def read_data(
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
             golds.append(sent)
-            assert golds[-1].morphology is not None
+            assert golds[-1]["morphs"] is not None

         sent_annots.append(sent)
         if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
             doc, gold = _make_gold(nlp, None, sent_annots)
-            assert gold.morphology is not None
+            assert gold["morphs"] is not None
             sent_annots = []
             docs.append(doc)
             golds.append(gold)
@@ -109,17 +108,10 @@ def read_data(
     return golds_to_gold_data(docs, golds)


-def _parse_morph_string(morph_string):
+def _compile_morph_string(morph_string, pos):
     if morph_string == '_':
-        return set()
-    output = []
-    replacements = {'1': 'one', '2': 'two', '3': 'three'}
-    for feature in morph_string.split('|'):
-        key, value = feature.split('=')
-        value = replacements.get(value, value)
-        value = value.split(',')[0]
-        output.append('%s_%s' % (key, value.lower()))
-    return set(output)
+        return f"POS={pos}"
+    return morph_string + f"|POS={pos}"


 def read_conllu(file_):
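The rewritten helper no longer parses the UD feature string into a set of normalized feature names; it keeps the raw `Key=Value|...` string and appends the POS tag as one more feature, which is the string format the new `morphs` annotation expects. A minimal sketch of the new behavior (the sample feature strings are illustrative):

    def _compile_morph_string(morph_string, pos):
        if morph_string == '_':
            return f"POS={pos}"
        return morph_string + f"|POS={pos}"

    # "_" means "no features" in CoNLL-U, so only the POS survives:
    assert _compile_morph_string("_", "NOUN") == "POS=NOUN"
    # otherwise the POS is appended as an extra feature:
    assert _compile_morph_string("Case=Nom|Number=Sing", "NOUN") == "Case=Nom|Number=Sing|POS=NOUN"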
@@ -155,7 +147,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     sent_starts = []
     for sent in sent_annots:
         gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
-        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
+        for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
             gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
@@ -168,7 +160,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
         doc = nlp.make_doc(text)
     gold.pop("spaces")
     gold["sent_starts"] = sent_starts
-    for i in range(len(gold.heads)):
+    for i in range(len(gold["heads"])):
         if random.random() < drop_deps:
             gold["heads"][i] = None
             gold["labels"][i] = None
@@ -185,7 +177,7 @@ def golds_to_gold_data(docs, golds):
     """Get out the training data format used by begin_training"""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example.from_dict(doc, gold)
+        example = Example.from_dict(doc, dict(gold))
         data.append(example)
     return data

@@ -354,8 +346,7 @@ def initialize_pipeline(nlp, examples, config, device):
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
     for eg in examples:
-        gold = eg.gold
-        for tag in gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     if torch is not None and device != -1:
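This hunk (and the matching one further down) migrates from the old `eg.gold` attributes to the v3 `Example` API, where aligned annotations are queried by attribute name; it also explains the `dict(gold)` wrapper above, since `Example.from_dict` expects a plain mapping of annotation lists. A hedged sketch of the API as used here (the sentence and tags are illustrative):

    from spacy.lang.en import English
    from spacy.gold import Example  # moved to spacy.training in the v3 release

    nlp = English()
    doc = nlp.make_doc("I saw cats")
    eg = Example.from_dict(doc, {"words": ["I", "saw", "cats"], "tags": ["PRP", "VBD", "NNS"]})
    # get_aligned projects the reference annotations onto the predicted
    # tokens; as_string=True yields tag strings rather than string-store IDs
    tag_labels = {t for t in eg.get_aligned("TAG", as_string=True) if t is not None}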
@@ -489,10 +480,6 @@ def main(
     Token.set_extension("begins_fused", default=False)
     Token.set_extension("inside_fused", default=False)

-    Token.set_extension("get_conllu_lines", method=get_token_conllu)
-    Token.set_extension("begins_fused", default=False)
-    Token.set_extension("inside_fused", default=False)
-
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
|
@ -535,10 +522,10 @@ def main(
|
||||||
else:
|
else:
|
||||||
batches = minibatch(examples, size=batch_sizes)
|
batches = minibatch(examples, size=batch_sizes)
|
||||||
losses = {}
|
losses = {}
|
||||||
n_train_words = sum(len(eg.doc) for eg in examples)
|
n_train_words = sum(len(eg.predicted) for eg in examples)
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
pbar.update(sum(len(ex.doc) for ex in batch))
|
pbar.update(sum(len(ex.predicted) for ex in batch))
|
||||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
batch,
|
batch,
|
||||||
|
|
|
@@ -283,7 +283,7 @@ def initialize_pipeline(nlp, examples, config):
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
     for eg in examples:
-        for tag in eg.gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
@@ -56,7 +56,7 @@ def main(model=None, output_dir=None, n_iter=100):
         print("Add label", ent[2])
         ner.add_label(ent[2])

-    with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
+    with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
         # show warnings for misaligned entity spans once
         warnings.filterwarnings("once", category=UserWarning, module="spacy")

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a9",
+    "thinc==8.0.0a11",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev10"
+__version__ = "3.0.0.dev12"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -15,6 +15,8 @@ from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
+from .project import project_clone, project_assets, project_run  # noqa: F401
+from .project import project_run_all  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
@@ -1,4 +1,3 @@
-from typing import Optional
 import typer
 from typer.main import get_command

@@ -102,9 +102,6 @@ def debug_data(
     corpus = Corpus(train_path, dev_path)
     try:
         train_dataset = list(corpus.train_dataset(nlp))
-        train_dataset_unpreprocessed = list(
-            corpus.train_dataset_without_preprocessing(nlp)
-        )
     except ValueError as e:
         loading_train_error_message = f"Training data cannot be loaded: {e}"
     try:
@@ -120,11 +117,9 @@ def debug_data(
     msg.good("Corpus is loadable")

     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
-    gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset_unpreprocessed, pipeline
-    )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
+    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
+    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -497,7 +492,7 @@ def _load_file(file_path: Path, msg: Printer) -> None:


 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language
+    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),
@@ -517,9 +512,9 @@ def _compile_gold(
         "n_cats_multilabel": 0,
         "texts": set(),
     }
-    for example in examples:
-        gold = example.reference
-        doc = example.predicted
+    for eg in examples:
+        gold = eg.reference
+        doc = eg.predicted
         valid_words = [x for x in gold if x is not None]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
@@ -530,7 +525,7 @@ def _compile_gold(
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
-            for i, label in enumerate(gold.ner):
+            for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
                 if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
@@ -556,16 +551,18 @@ def _compile_gold(
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
         if "tagger" in pipeline:
-            data["tags"].update([x for x in gold.tags if x is not None])
+            tags = eg.get_aligned("TAG", as_string=True)
+            data["tags"].update([x for x in tags if x is not None])
         if "parser" in pipeline:
-            data["deps"].update([x for x in gold.labels if x is not None])
-            for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
+            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
+            data["deps"].update([x for x in aligned_deps if x is not None])
+            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
                 if head == i:
                     data["roots"].update([dep])
                     data["n_sents"] += 1
-            if nonproj.is_nonproj_tree(gold.heads):
+            if nonproj.is_nonproj_tree(aligned_heads):
                 data["n_nonproj"] += 1
-            if nonproj.contains_cycle(gold.heads):
+            if nonproj.contains_cycle(aligned_heads):
                 data["n_cycles"] += 1
     return data

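The new `make_proj` flag controls whether `get_aligned_parse` projectivizes the reference parse before the tree statistics are collected. For background, a tree is non-projective when two dependency arcs cross; a self-contained illustration of that property (not spaCy's `nonproj` implementation):

    def is_nonprojective(heads):
        # heads[i] is the index of token i's head; a root points at itself
        arcs = [(min(i, h), max(i, h)) for i, h in enumerate(heads) if h != i]
        for idx, (l1, r1) in enumerate(arcs):
            for l2, r2 in arcs[idx + 1:]:
                # two arcs cross when exactly one endpoint of one arc
                # falls strictly inside the span of the other
                if l1 < l2 < r1 < r2 or l2 < l1 < r2 < r1:
                    return True
        return False

    assert not is_nonprojective([1, 1, 1, 2])  # nested arcs, projective
    assert is_nonprojective([2, 3, 2, 2])      # arcs (0,2) and (1,3) cross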
@@ -581,7 +578,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     for eg in data:
         labels = [
             label.split("-")[1]
-            for label in eg.gold.ner
+            for label in eg.get_aligned_ner()
             if label not in ("O", "-", None)
         ]
         if label not in labels:
@@ -1,7 +1,9 @@
-from typing import Optional, List
+from typing import Optional, List, Dict
 from timeit import default_timer as timer
 from wasabi import Printer
 from pathlib import Path
+import re
+import srsly

 from ..gold import Corpus
 from ..tokens import Doc
@@ -16,13 +18,12 @@ def evaluate_cli(
     # fmt: off
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
+    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
     gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
-    return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
     # fmt: on
 ):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
@@ -31,24 +32,24 @@ def evaluate_cli(
     evaluate(
         model,
         data_path,
+        output=output,
         gpu_id=gpu_id,
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
         silent=False,
-        return_scores=return_scores,
     )


 def evaluate(
     model: str,
     data_path: Path,
+    output: Optional[Path],
     gpu_id: int = -1,
     gold_preproc: bool = False,
     displacy_path: Optional[Path] = None,
     displacy_limit: int = 25,
     silent: bool = True,
-    return_scores: bool = False,
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     util.fix_random_seed()
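With the new `output` argument, metrics are written to a JSON file instead of being requested via the removed `--return-scores` flag; a typical invocation might look like this (the model and file names are illustrative):

    python -m spacy evaluate ./my-model ./dev-data.json --output ./metrics.json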
@@ -56,21 +57,19 @@ def evaluate(
     util.use_gpu(gpu_id)
     util.set_env_log(False)
     data_path = util.ensure_path(data_path)
+    output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
         msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = Corpus(data_path, data_path)
-    if model.startswith("blank:"):
-        nlp = util.get_lang_class(model.replace("blank:", ""))()
-    else:
-        nlp = util.load_model(model)
+    nlp = util.load_model(model)
     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_dataset, verbose=False)
     end = timer()
-    nwords = sum(len(ex.doc) for ex in dev_dataset)
+    nwords = sum(len(ex.predicted) for ex in dev_dataset)
     results = {
         "Time": f"{end - begin:.2f} s",
         "Words": nwords,
@@ -90,10 +89,22 @@ def evaluate(
         "Sent R": f"{scorer.sent_r:.2f}",
         "Sent F": f"{scorer.sent_f:.2f}",
     }
+    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
+
     msg.table(results, title="Results")

+    if scorer.ents_per_type:
+        data["ents_per_type"] = scorer.ents_per_type
+        print_ents_per_type(msg, scorer.ents_per_type)
+    if scorer.textcats_f_per_cat:
+        data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
+        print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
+    if scorer.textcats_auc_per_cat:
+        data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
+        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
+
     if displacy_path:
-        docs = [ex.doc for ex in dev_dataset]
+        docs = [ex.predicted for ex in dev_dataset]
         render_deps = "parser" in nlp.meta.get("pipeline", [])
         render_ents = "ner" in nlp.meta.get("pipeline", [])
         render_parses(
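The added comprehension normalizes the human-readable result labels into snake_case keys for the JSON output; plain Python shows the effect (the labels and values here are made up):

    import re

    results = {"Time": "1.52 s", "Sent F": "88.20", "Words/s": "15000"}
    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
    # {'time': '1.52 s', 'sent_f': '88.20', 'words_s': '15000'}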
@@ -105,8 +116,11 @@ def evaluate(
             ents=render_ents,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
-    if return_scores:
-        return scorer.scores
+    if output_path is not None:
+        srsly.write_json(output_path, data)
+        msg.good(f"Saved results to {output_path}")
+    return data


 def render_parses(
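Writing and reading the metrics file is symmetric via srsly, spaCy's serialization helper (file name and score are illustrative):

    import srsly

    srsly.write_json("metrics.json", {"sent_f": 88.2})
    assert srsly.read_json("metrics.json")["sent_f"] == 88.2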
@@ -128,3 +142,40 @@ def render_parses(
         )
     with (output_path / "parses.html").open("w", encoding="utf8") as file_:
         file_.write(html)
+
+
+def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    data = [
+        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        for k, v in scores.items()
+    ]
+    msg.table(
+        data,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="NER (per type)",
+    )
+
+
+def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    data = [
+        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        for k, v in scores.items()
+    ]
+    msg.table(
+        data,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="Textcat F (per type)",
+    )
+
+
+def print_textcats_auc_per_cat(
+    msg: Printer, scores: Dict[str, Dict[str, float]]
+) -> None:
+    msg.table(
+        [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()],
+        header=("", "ROC AUC"),
+        aligns=("l", "r"),
+        title="Textcat ROC AUC (per label)",
+    )
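All three helpers funnel per-label scores through wasabi's `msg.table`, with the same header/alignment arguments used above; a hedged, standalone sketch with made-up scores:

    from wasabi import Printer

    msg = Printer()
    scores = {"PERSON": {"p": 0.91, "r": 0.87, "f": 0.89}}
    data = [(k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}") for k, v in scores.items()]
    msg.table(data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), title="NER (per type)")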
@@ -16,8 +16,9 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
+    meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
     # fmt: on
 ):
@@ -32,6 +33,7 @@ def package_cli(
         input_dir,
         output_dir,
         meta_path=meta_path,
+        version=version,
         create_meta=create_meta,
         force=force,
         silent=False,
@@ -42,6 +44,7 @@ def package(
     input_dir: Path,
     output_dir: Path,
     meta_path: Optional[Path] = None,
+    version: Optional[str] = None,
     create_meta: bool = False,
     force: bool = False,
     silent: bool = True,
@@ -61,10 +64,13 @@ def package(
     if not meta_path.exists() or not meta_path.is_file():
         msg.fail("Can't load model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
+    meta = get_meta(input_dir, meta)
+    if version is not None:
+        meta["version"] = version
     if not create_meta:  # only print if user doesn't want to overwrite
         msg.good("Loaded meta.json from file", meta_path)
     else:
-        meta = generate_meta(input_dir, meta, msg)
+        meta = generate_meta(meta, msg)
     errors = validate(ModelMetaSchema, meta)
     if errors:
         msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
@@ -101,20 +107,20 @@ def create_file(file_path: Path, contents: str) -> None:
     file_path.open("w", encoding="utf-8").write(contents)


-def generate_meta(
-    model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
+def get_meta(
+    model_path: Union[str, Path], existing_meta: Dict[str, Any]
 ) -> Dict[str, Any]:
-    meta = existing_meta or {}
-    settings = [
-        ("lang", "Model language", meta.get("lang", "en")),
-        ("name", "Model name", meta.get("name", "model")),
-        ("version", "Model version", meta.get("version", "0.0.0")),
-        ("description", "Model description", meta.get("description", False)),
-        ("author", "Author", meta.get("author", False)),
-        ("email", "Author email", meta.get("email", False)),
-        ("url", "Author website", meta.get("url", False)),
-        ("license", "License", meta.get("license", "MIT")),
-    ]
+    meta = {
+        "lang": "en",
+        "name": "model",
+        "version": "0.0.0",
+        "description": None,
+        "author": None,
+        "email": None,
+        "url": None,
+        "license": "MIT",
+    }
+    meta.update(existing_meta)
     nlp = util.load_model_from_path(Path(model_path))
     meta["spacy_version"] = util.get_model_version_range(about.__version__)
     meta["pipeline"] = nlp.pipe_names
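The rewritten `get_meta` starts from a dict of defaults and overlays the user's existing meta, so provided keys win and missing ones fall back; a minimal demonstration of the `update` semantics:

    defaults = {"lang": "en", "name": "model", "version": "0.0.0", "license": "MIT"}
    meta = dict(defaults)
    meta.update({"name": "core_web_sm", "version": "3.0.0"})
    # meta == {"lang": "en", "name": "core_web_sm", "version": "3.0.0", "license": "MIT"}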
@@ -124,6 +130,23 @@ def get_meta(
         "keys": nlp.vocab.vectors.n_keys,
         "name": nlp.vocab.vectors.name,
     }
+    if about.__title__ != "spacy":
+        meta["parent_package"] = about.__title__
+    return meta
+
+
+def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
+    meta = existing_meta or {}
+    settings = [
+        ("lang", "Model language", meta.get("lang", "en")),
+        ("name", "Model name", meta.get("name", "model")),
+        ("version", "Model version", meta.get("version", "0.0.0")),
+        ("description", "Model description", meta.get("description", None)),
+        ("author", "Author", meta.get("author", None)),
+        ("email", "Author email", meta.get("email", None)),
+        ("url", "Author website", meta.get("url", None)),
+        ("license", "License", meta.get("license", "MIT")),
+    ]
     msg.divider("Generating meta.json")
     msg.text(
         "Enter the package settings for your model. The following information "
@@ -132,8 +155,6 @@ def generate_meta(
     for setting, desc, default in settings:
         response = get_raw_input(desc, default)
         meta[setting] = default if response == "" and default else response
-    if about.__title__ != "spacy":
-        meta["parent_package"] = about.__title__
     return meta

@@ -184,12 +205,12 @@ def setup_package():

     setup(
         name=model_name,
-        description=meta['description'],
-        author=meta['author'],
-        author_email=meta['email'],
-        url=meta['url'],
+        description=meta.get('description'),
+        author=meta.get('author'),
+        author_email=meta.get('email'),
+        url=meta.get('url'),
         version=meta['version'],
-        license=meta['license'],
+        license=meta.get('license'),
         packages=[model_name],
         package_data={model_name: list_files(model_dir)},
         install_requires=list_requirements(meta),
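Switching from `meta['description']` to `meta.get('description')` lets packaging tolerate metas that omit optional fields: a missing key now yields None instead of raising KeyError, as in:

    meta = {"version": "0.0.0"}
    meta.get("description")  # None; setup() just receives an empty field
    # meta["description"]    # would raise KeyError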
679
spacy/cli/project.py
Normal file
679
spacy/cli/project.py
Normal file
|
@ -0,0 +1,679 @@
|
||||||
|
from typing import List, Dict, Any, Optional, Sequence
|
||||||
|
import typer
|
||||||
|
import srsly
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
import subprocess
|
||||||
|
import shlex
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
from ._app import app, Arg, Opt, COMMAND, NAME
|
||||||
|
from .. import about
|
||||||
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
|
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
||||||
|
from ..util import get_hash, get_checksum
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG_FILE = "project.yml"
|
||||||
|
DVC_CONFIG = "dvc.yaml"
|
||||||
|
DIRS = [
|
||||||
|
"assets",
|
||||||
|
"metas",
|
||||||
|
"configs",
|
||||||
|
"packages",
|
||||||
|
"metrics",
|
||||||
|
"scripts",
|
||||||
|
"notebooks",
|
||||||
|
"training",
|
||||||
|
"corpus",
|
||||||
|
]
|
||||||
|
CACHES = [
|
||||||
|
Path.home() / ".torch",
|
||||||
|
Path.home() / ".caches" / "torch",
|
||||||
|
os.environ.get("TORCH_HOME"),
|
||||||
|
Path.home() / ".keras",
|
||||||
|
]
|
||||||
|
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
|
||||||
|
# it directly and edit the project.yml instead and re-run the project."""
|
||||||
|
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
|
||||||
|
templates. You'd typically start by cloning a project template to a local
|
||||||
|
directory and fetching its assets like datasets etc. See the project's
|
||||||
|
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
|
||||||
|
Version Control) to manage input and output files and to ensure steps are only
|
||||||
|
re-run if their inputs change.
|
||||||
|
"""
|
||||||
|
|
||||||
|
project_cli = typer.Typer(help=CLI_HELP)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.callback(invoke_without_command=True)
|
||||||
|
def callback(ctx: typer.Context):
|
||||||
|
"""This runs before every project command and ensures DVC is installed."""
|
||||||
|
ensure_dvc()
|
||||||
|
|
||||||
|
|
||||||
|
################
|
||||||
|
# CLI COMMANDS #
|
||||||
|
################
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("clone")
|
||||||
|
def project_clone_cli(
|
||||||
|
# fmt: off
|
||||||
|
name: str = Arg(..., help="The name of the template to fetch"),
|
||||||
|
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
|
||||||
|
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||||
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
|
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Clone a project template from a repository. Calls into "git" and will
|
||||||
|
only download the files from the given subdirectory. The GitHub repo
|
||||||
|
defaults to the official spaCy template repo, but can be customized
|
||||||
|
(including using a private repo). Setting the --git flag will also
|
||||||
|
initialize the project directory as a Git repo. If the project is intended
|
||||||
|
to be a Git repo, it should be initialized with Git first, before
|
||||||
|
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||||
|
Git.
|
||||||
|
"""
|
||||||
|
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("init")
|
||||||
|
def project_init_cli(
|
||||||
|
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
||||||
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
|
):
|
||||||
|
"""Initialize a project directory with DVC and optionally Git. This should
|
||||||
|
typically be taken care of automatically when you run the "project clone"
|
||||||
|
command, but you can also run it separately. If the project is intended to
|
||||||
|
be a Git repo, it should be initialized with Git first, before initializing
|
||||||
|
DVC. This allows DVC to integrate with Git.
|
||||||
|
"""
|
||||||
|
project_init(path, git=git, silent=True)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("assets")
|
||||||
|
def project_assets_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Use DVC (Data Version Control) to fetch project assets. Assets are
|
||||||
|
defined in the "assets" section of the project config. If possible, DVC
|
||||||
|
will try to track the files so you can pull changes from upstream. It will
|
||||||
|
also try and store the checksum so the assets are versioned. If th file
|
||||||
|
can't be tracked or checked, it will be downloaded without DVC. If a checksum
|
||||||
|
is provided in the project config, the file is only downloaded if no local
|
||||||
|
file with the same checksum exists.
|
||||||
|
"""
|
||||||
|
project_assets(project_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
|
||||||
|
"run-all",
|
||||||
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
|
def project_run_all_cli(
|
||||||
|
# fmt: off
|
||||||
|
ctx: typer.Context,
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Run all commands defined in the project. This command will use DVC and
|
||||||
|
the defined outputs and dependencies in the project config to determine
|
||||||
|
which steps need to be re-run and where to start. This means you're only
|
||||||
|
re-generating data if the inputs have changed.
|
||||||
|
|
||||||
|
This command calls into "dvc repro" and all additional arguments are passed
|
||||||
|
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
|
||||||
|
"""
|
||||||
|
if show_help:
|
||||||
|
print_run_help(project_dir)
|
||||||
|
else:
|
||||||
|
project_run_all(project_dir, *ctx.args)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
|
||||||
|
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
)
|
||||||
|
def project_run_cli(
|
||||||
|
# fmt: off
|
||||||
|
ctx: typer.Context,
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
||||||
|
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Run a named script defined in the project config. If the command is
|
||||||
|
part of the default pipeline defined in the "run" section, DVC is used to
|
||||||
|
determine whether the step should re-run if its inputs have changed, or
|
||||||
|
whether everything is up to date. If the script is not part of the default
|
||||||
|
pipeline, it will be called separately without DVC.
|
||||||
|
|
||||||
|
If DVC is used, the command calls into "dvc repro" and all additional
|
||||||
|
arguments are passed to the "dvc repro" command:
|
||||||
|
https://dvc.org/doc/command-reference/repro
|
||||||
|
"""
|
||||||
|
if show_help or not subcommand:
|
||||||
|
print_run_help(project_dir, subcommand)
|
||||||
|
else:
|
||||||
|
project_run(project_dir, subcommand, *ctx.args)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("exec", hidden=True)
|
||||||
|
def project_exec_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Execute a command defined in the project config. This CLI command is
|
||||||
|
only called internally in auto-generated DVC pipelines, as a shortcut for
|
||||||
|
multi-step commands in the project config. You typically shouldn't have to
|
||||||
|
call it yourself. To run a command, call "run" or "run-all".
|
||||||
|
"""
|
||||||
|
project_exec(project_dir, subcommand)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("update-dvc")
|
||||||
|
def project_update_dvc_cli(
|
||||||
|
# fmt: off
|
||||||
|
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
||||||
|
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||||
|
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||||
|
# fmt: on
|
||||||
|
):
|
||||||
|
"""Update the auto-generated DVC config file. Uses the steps defined in the
|
||||||
|
"run" section of the project config. This typically happens automatically
|
||||||
|
when running a command, but can also be triggered manually if needed.
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
||||||
|
if updated:
|
||||||
|
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
||||||
|
else:
|
||||||
|
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
||||||
|
|
||||||
|
|
||||||
|
app.add_typer(project_cli, name="project")
|
||||||
|
|
||||||
|
|
||||||
|
#################
|
||||||
|
# CLI FUNCTIONS #
|
||||||
|
#################
|
||||||
|
|
||||||
|
|
||||||
|
def project_clone(
|
||||||
|
name: str,
|
||||||
|
dest: Path,
|
||||||
|
*,
|
||||||
|
repo: str = about.__projects__,
|
||||||
|
git: bool = False,
|
||||||
|
no_init: bool = False,
|
||||||
|
) -> None:
|
||||||
|
"""Clone a project template from a repository.
|
||||||
|
|
||||||
|
name (str): Name of subdirectory to clone.
|
||||||
|
dest (Path): Destination path of cloned project.
|
||||||
|
repo (str): URL of Git repo containing project templates.
|
||||||
|
git (bool): Initialize project as Git repo. Should be set to True if project
|
||||||
|
is intended as a repo, since it will allow DVC to integrate with Git.
|
||||||
|
no_init (bool): Don't initialize DVC and Git automatically. If True, the
|
||||||
|
"init" command or "git init" and "dvc init" need to be run manually.
|
||||||
|
"""
|
||||||
|
dest = ensure_path(dest)
|
||||||
|
check_clone(name, dest, repo)
|
||||||
|
project_dir = dest.resolve()
|
||||||
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||||
|
run_command(shlex.split(cmd))
|
||||||
|
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||||
|
f.write(name)
|
||||||
|
run_command(["git", "-C", tmp_dir, "fetch"])
|
||||||
|
run_command(["git", "-C", tmp_dir, "checkout"])
|
||||||
|
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
||||||
|
msg.good(f"Cloned project '{name}' from {repo}")
|
||||||
|
for sub_dir in DIRS:
|
||||||
|
dir_path = project_dir / sub_dir
|
||||||
|
if not dir_path.exists():
|
||||||
|
dir_path.mkdir(parents=True)
|
||||||
|
if not no_init:
|
||||||
|
project_init(project_dir, git=git, silent=True)
|
||||||
|
msg.good(f"Your project is now ready!", dest)
|
||||||
|
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||||
|
|
||||||
|
|
||||||
|
def project_init(
|
||||||
|
project_dir: Path,
|
||||||
|
*,
|
||||||
|
git: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
analytics: bool = False,
|
||||||
|
):
|
||||||
|
"""Initialize a project as a DVC and (optionally) as a Git repo.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
git (bool): Also call "git init" to initialize directory as a Git repo.
|
||||||
|
silent (bool): Don't print any output (via DVC).
|
||||||
|
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
||||||
|
"""
|
||||||
|
with working_dir(project_dir):
|
||||||
|
init_cmd = ["dvc", "init"]
|
||||||
|
if silent:
|
||||||
|
init_cmd.append("--quiet")
|
||||||
|
if not git:
|
||||||
|
init_cmd.append("--no-scm")
|
||||||
|
if git:
|
||||||
|
run_command(["git", "init"])
|
||||||
|
run_command(init_cmd)
|
||||||
|
# We don't want to have analytics on by default – our users should
|
||||||
|
# opt-in explicitly. If they want it, they can always enable it.
|
||||||
|
if not analytics:
|
||||||
|
run_command(["dvc", "config", "core.analytics", "false"])
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
setup_check_dvc(project_dir, config)
|
||||||
|
|
||||||
|
|
||||||
|
def project_assets(project_dir: Path) -> None:
|
||||||
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
"""
|
||||||
|
project_path = ensure_path(project_dir)
|
||||||
|
config = load_project_config(project_path)
|
||||||
|
setup_check_dvc(project_path, config)
|
||||||
|
assets = config.get("assets", {})
|
||||||
|
if not assets:
|
||||||
|
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
||||||
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
fetched_assets = []
|
||||||
|
for asset in assets:
|
||||||
|
url = asset["url"].format(**variables)
|
||||||
|
dest = asset["dest"].format(**variables)
|
||||||
|
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
||||||
|
if fetched_path:
|
||||||
|
fetched_assets.append(str(fetched_path))
|
||||||
|
if fetched_assets:
|
||||||
|
with working_dir(project_path):
|
||||||
|
run_command(["dvc", "add", *fetched_assets, "--external"])
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_asset(
|
||||||
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
|
) -> Optional[Path]:
|
||||||
|
"""Fetch an asset from a given URL or path. Will try to import the file
|
||||||
|
using DVC's import-url if possible (fully tracked and versioned) and falls
|
||||||
|
back to get-url (versioned) and a non-DVC download if necessary. If a
|
||||||
|
checksum is provided and a local file exists, it's only re-downloaded if the
|
||||||
|
checksum doesn't match.
|
||||||
|
|
||||||
|
project_path (Path): Path to project directory.
|
||||||
|
url (str): URL or path to asset.
|
||||||
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||||
|
the asset failed.
|
||||||
|
"""
|
||||||
|
url = convert_asset_url(url)
|
||||||
|
dest_path = (project_path / dest).resolve()
|
||||||
|
if dest_path.exists() and checksum:
|
||||||
|
# If there's already a file, check for checksum
|
||||||
|
# TODO: add support for caches (dvc import-url with local path)
|
||||||
|
if checksum == get_checksum(dest_path):
|
||||||
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
|
return dest_path
|
||||||
|
with working_dir(project_path):
|
||||||
|
try:
|
||||||
|
# If these fail, we don't want to output an error or info message.
|
||||||
|
# Try with tracking the source first, then just downloading with
|
||||||
|
# DVC, then a regular non-DVC download.
|
||||||
|
try:
|
||||||
|
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
||||||
|
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
||||||
|
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
try:
|
||||||
|
download_file(url, dest_path)
|
||||||
|
except requests.exceptions.HTTPError as e:
|
||||||
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
|
return None
|
||||||
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
|
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
||||||
|
msg.good(f"Fetched asset {dest}")
|
||||||
|
return dest_path
|
||||||
|
|
||||||
|
|
||||||
|
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
|
"""Run all commands defined in the project using DVC.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
*dvc_args: Other arguments passed to "dvc repro".
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
setup_check_dvc(project_dir, config)
|
||||||
|
dvc_cmd = ["dvc", "repro", *dvc_args]
|
||||||
|
with working_dir(project_dir):
|
||||||
|
run_command(dvc_cmd)
|
||||||
|
|
||||||
|
|
||||||
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
|
"""Simulate a CLI help prompt using the info available in the project config.
|
||||||
|
|
||||||
|
project_dir (Path): The project directory.
|
||||||
|
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||||
|
provided, the subcommand help is shown. Otherwise, the top-level help
|
||||||
|
and a list of available commands is printed.
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
setup_check_dvc(project_dir, config)
|
||||||
|
config_commands = config.get("commands", [])
|
||||||
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
|
if subcommand:
|
||||||
|
validate_subcommand(commands.keys(), subcommand)
|
||||||
|
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
|
||||||
|
help_text = commands[subcommand].get("help")
|
||||||
|
if help_text:
|
||||||
|
msg.text(f"\n{help_text}\n")
|
||||||
|
else:
|
||||||
|
print(f"\nAvailable commands in {CONFIG_FILE}")
|
||||||
|
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
|
||||||
|
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||||
|
msg.text("Run all commands defined in the 'run' block of the project config:")
|
||||||
|
print(f"{COMMAND} project run-all {project_dir}")
|
||||||
|
|
||||||
|
|
||||||
|
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
|
"""Run a named script defined in the project config. If the script is part
|
||||||
|
of the default pipeline (defined in the "run" section), DVC is used to
|
||||||
|
execute the command, so it can determine whether to rerun it. It then
|
||||||
|
calls into "exec" to execute it.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
subcommand (str): Name of command to run.
|
||||||
|
*dvc_args: Other arguments passed to "dvc repro".
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
setup_check_dvc(project_dir, config)
|
||||||
|
config_commands = config.get("commands", [])
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
|
validate_subcommand(commands.keys(), subcommand)
|
||||||
|
if subcommand in config.get("run", []):
|
||||||
|
# This is one of the pipeline commands tracked in DVC
|
||||||
|
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
||||||
|
with working_dir(project_dir):
|
||||||
|
run_command(dvc_cmd)
|
||||||
|
else:
|
||||||
|
cmd = commands[subcommand]
|
||||||
|
# Deps in non-DVC commands aren't tracked, but if they're defined,
|
||||||
|
# make sure they exist before running the command
|
||||||
|
for dep in cmd.get("deps", []):
|
||||||
|
if not (project_dir / dep).exists():
|
||||||
|
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||||
|
msg.fail(err, exits=1)
|
||||||
|
with working_dir(project_dir):
|
||||||
|
run_commands(cmd["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
|
def project_exec(project_dir: Path, subcommand: str):
|
||||||
|
"""Execute a command defined in the project config.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
subcommand (str): Name of command to run.
|
||||||
|
"""
|
||||||
|
config = load_project_config(project_dir)
|
||||||
|
config_commands = config.get("commands", [])
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
|
with working_dir(project_dir):
|
||||||
|
run_commands(commands[subcommand]["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
|
###########
|
||||||
|
# HELPERS #
|
||||||
|
###########
|
||||||
|
|
||||||
|
|
||||||
|
def load_project_config(path: Path) -> Dict[str, Any]:
|
||||||
|
"""Load the project config file from a directory and validate it.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
RETURNS (Dict[str, Any]): The loaded project config.
|
||||||
|
"""
|
||||||
|
config_path = path / CONFIG_FILE
|
||||||
|
if not config_path.exists():
|
||||||
|
msg.fail("Can't find project config", config_path, exits=1)
|
||||||
|
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
||||||
|
try:
|
||||||
|
config = srsly.read_yaml(config_path)
|
||||||
|
except ValueError as e:
|
||||||
|
msg.fail(invalid_err, e, exits=1)
|
||||||
|
errors = validate(ProjectConfigSchema, config)
|
||||||
|
if errors:
|
||||||
|
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def update_dvc_config(
|
||||||
|
path: Path,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
verbose: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
force: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||||
|
project directory. The file is auto-generated based on the config. The
|
||||||
|
first line of the auto-generated file specifies the hash of the config
|
||||||
|
dict, so if any of the config values change, the DVC config is regenerated.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
config (Dict[str, Any]): The loaded project config.
|
||||||
|
verbose (bool): Whether to print additional info (via DVC).
|
||||||
|
silent (bool): Don't output anything (via DVC).
|
||||||
|
force (bool): Force update, even if hashes match.
|
||||||
|
RETURNS (bool): Whether the DVC config file was updated.
|
||||||
|
"""
|
||||||
|
config_hash = get_hash(config)
|
||||||
|
path = path.resolve()
|
||||||
|
dvc_config_path = path / DVC_CONFIG
|
||||||
|
if dvc_config_path.exists():
|
||||||
|
# Cneck if the file was generated using the current config, if not, redo
|
||||||
|
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||||
|
ref_hash = f.readline().strip().replace("# ", "")
|
||||||
|
if ref_hash == config_hash and not force:
|
||||||
|
return False # Nothing has changed in project config, don't need to update
|
||||||
|
dvc_config_path.unlink()
|
||||||
|
variables = config.get("variables", {})
|
||||||
|
commands = []
|
||||||
|
# We only want to include commands that are part of the main list of "run"
|
||||||
|
# commands in project.yml and should be run in sequence
|
||||||
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
|
for name in config.get("run", []):
|
||||||
|
validate_subcommand(config_commands.keys(), name)
|
||||||
|
command = config_commands[name]
|
||||||
|
deps = command.get("deps", [])
|
||||||
|
outputs = command.get("outputs", [])
|
||||||
|
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||||
|
if not deps and not outputs and not outputs_no_cache:
|
||||||
|
continue
|
||||||
|
# Default to "." as the project path since dvc.yaml is auto-generated
|
||||||
|
# and we don't want arbitrary paths in there
|
||||||
|
project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
|
||||||
|
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||||
|
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||||
|
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||||
|
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
|
||||||
|
if verbose:
|
||||||
|
dvc_cmd.append("--verbose")
|
||||||
|
if silent:
|
||||||
|
dvc_cmd.append("--quiet")
|
||||||
|
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||||
|
commands.append(" ".join(full_cmd))
|
||||||
|
with working_dir(path):
|
||||||
|
run_commands(commands, variables, silent=True)
|
||||||
|
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||||
|
content = f.read()
|
||||||
|
f.seek(0, 0)
|
||||||
|
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|

def ensure_dvc() -> None:
    """Ensure that the "dvc" command is available and show an error if not."""
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )


def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Will raise an error if the project is not an initialized
    DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        msg.fail(f"Can't find project directory: {project_dir}")
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")


def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The split commands.
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = shlex.split(command)
        # TODO: is this needed / a good idea?
        if len(command) and command[0] == "python":
            command[0] = sys.executable
        elif len(command) and command[0] == "pip":
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(" ".join(command))
        run_command(command)
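
# Editor's example of the substitution step above, with an illustrative
# variables mapping: given variables={"NAME": "model"}, the command string
# "python ./{NAME}.py" is formatted to "python ./model.py", split by shlex
# into ["python", "./model.py"], and "python" is then swapped for
# sys.executable so the command runs in the active environment.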

def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match(r"(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
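
# Editor's example with a hypothetical asset URL:
# "https://github.com/user/repo/blob/master/data.json" becomes
# "https://raw.githubusercontent.com/user/repo/master/data.json" - the host is
# swapped and the "/blob/" (or "/tree/") path segment is dropped.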

def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "Cloning spaCy project templates requires Git and the 'git' command.",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )


def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if subcommand not in commands:
        msg.fail(
            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
            f"Available commands: {', '.join(commands)}",
            exits=1,
        )


def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)
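
# Editor's minimal usage sketch (URL and destination are hypothetical):
#
#     download_file("https://example.com/assets/data.bin", Path("data.bin"))
#
# raise_for_status() aborts on HTTP errors before anything is written, and
# tqdm scales the progress bar to the Content-Length header when present.
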
@@ -132,6 +132,7 @@ class Warnings(object):
             "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")

     # TODO: fix numbering after merging develop into master
+    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
             "input data correctly formatted ?")
     W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
@@ -154,7 +155,7 @@ class Warnings(object):
             "so a default configuration was used.")
     W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
             "but got '{type}' instead, so ignoring it.")
-    W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
@@ -182,18 +183,13 @@ class Errors(object):
             "`nlp.select_pipes()`, you should remove them explicitly with "
             "`nlp.remove_pipe()` before the pipeline is restored. Names of "
             "the new components: {names}")
-    E009 = ("The `update` method expects same number of docs and golds, but "
-            "got: {n_docs} docs, {n_golds} golds.")
     E010 = ("Word vectors set to length 0. This may be because you don't have "
             "a model installed or loaded, or because your model doesn't "
             "include word vectors. For more info, see the docs:\n"
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E013 = ("Error selecting action in matcher")
     E014 = ("Unknown tag ID: {tag}")
-    E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
-            "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@@ -201,21 +197,8 @@ class Errors(object):
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
             "IDs are enumerated in spacy/syntax/{src}.pyx.")
-    E020 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The tree is non-projective (i.e. it has "
-            "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
-            "The ArcEager transition system only supports projective trees. "
-            "To learn non-projective representations, transform the data "
-            "before training and after parsing. Either pass "
-            "`make_projective=True` to the GoldParse class, or use "
-            "spacy.syntax.nonproj.preprocess_training_data.")
-    E021 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The GoldParse was projective. The transition "
-            "system has {n_actions} actions. State at failure: {state}")
     E022 = ("Could not find a transition with the name '{name}' in the NER "
             "model.")
-    E023 = ("Error cleaning up beam: The same state occurred twice at "
-            "memory address {addr} and position {i}.")
     E024 = ("Could not find an optimal move to supervise the parser. Usually, "
             "this means that the model can't be updated in a way that's valid "
             "and satisfies the correct annotations specified in the GoldParse. "
@@ -259,7 +242,6 @@ class Errors(object):
             "offset {start}.")
     E037 = ("Error calculating span: Can't find a token ending at character "
             "offset {end}.")
-    E038 = ("Error finding sentence for span. Infinite loop detected.")
     E039 = ("Array bounds exceeded while searching for root word. This likely "
             "means the parse tree is in an invalid state. Please report this "
             "issue here: http://github.com/explosion/spaCy/issues")
@@ -290,8 +272,6 @@ class Errors(object):
     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
     E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
             "({rows}, {cols}).")
-    E061 = ("Bad file name: {filename}. Example of a valid file name: "
-            "'vectors.128.f.bin'")
     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
             "and 63 are occupied. You can replace one by specifying the "
             "`flag_id` explicitly, e.g. "
@@ -305,39 +285,17 @@ class Errors(object):
             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
     E065 = ("Only one of the vector table's width and shape can be specified. "
             "Got width {width} and shape {shape}.")
-    E066 = ("Error creating model helper for extracting columns. Can only "
-            "extract columns by positive integer. Got: {value}.")
     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
             "an entity) without a preceding 'B' (beginning of an entity). "
             "Tag sequence:\n{tags}")
     E068 = ("Invalid BILUO tag: '{tag}'.")
-    E069 = ("Invalid gold-standard parse tree. Found cycle between word "
-            "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
-            "with tokens: {doc_tokens}.")
-    E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
-            "does not align with number of annotations ({n_annots}).")
     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
             "match the one in the vocab ({vocab_orth}).")
-    E072 = ("Error serializing lexeme: expected data length {length}, "
-            "got {bad_length}.")
     E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
             "are of length {length}. You can use `vocab.reset_vectors` to "
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
-    E075 = ("Error accepting match: length ({length}) > maximum length "
-            "(113,446).")
-    E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
-            "has {words} words.")
-    E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
-            "equal number of GoldParse objects ({n_golds}) in batch.")
-    E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
-            "not equal number of words in GoldParse ({words_gold}).")
-    E079 = ("Error computing states in beam: number of predicted beams "
-            "({pbeams}) does not equal number of gold beams ({gbeams}).")
-    E080 = ("Duplicate state found in beam: {key}.")
-    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
-            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -345,8 +303,6 @@ class Errors(object):
             "`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
     E084 = ("Error assigning label ID {label} to span: not in StringStore.")
     E085 = ("Can't create lexeme for string '{string}'.")
-    E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
-            "not match hash {hash_id} in StringStore.")
     E087 = ("Unknown displaCy style: {style}.")
     E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
             "v2.x parser and NER models require roughly 1GB of temporary "
@@ -388,7 +344,6 @@ class Errors(object):
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
             "you're setting don't overlap.")
-    E104 = ("Can't find JSON schema for '{name}'.")
     E105 = ("The Doc.print_tree() method is now deprecated. Please use "
             "Doc.to_json() instead or write your own function.")
     E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
@@ -411,8 +366,6 @@ class Errors(object):
             "practically no advantage over pickling the parent Doc directly. "
             "So instead of pickling the span, pickle the Doc it belongs to or "
             "use Span.as_doc to convert the span to a standalone Doc object.")
-    E113 = ("The newly split token can only have one root (head = 0).")
-    E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
     E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
@@ -435,12 +388,9 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E127 = ("Cannot create phrase pattern representation for length 0. This "
-            "is likely a bug in spaCy.")
     E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
             "arguments to exclude fields from being serialized or deserialized "
             "is now deprecated. Please use the `exclude` argument instead. "
@@ -482,8 +432,6 @@ class Errors(object):
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. Did you forget to "
             "call add_label()?")
-    E144 = ("Could not find parameter `{param}` when building the entity "
-            "linker model.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -495,8 +443,6 @@ class Errors(object):
             "the component matches the model being loaded.")
     E150 = ("The language of the `nlp` object and the `vocab` should be the "
             "same, but found '{nlp}' and '{vocab}' respectively.")
-    E151 = ("Trying to call nlp.update without required annotation types. "
-            "Expected top-level keys: {exp}. Got: {unexp}.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
             "or EntityRuler for more details.")
@@ -533,11 +479,6 @@ class Errors(object):
             "that case.")
     E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
-    E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
-            "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pretrained model, make "
-            "sure that your models are up to date:\npython -m spacy validate")
-    E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
@@ -548,8 +489,6 @@ class Errors(object):
     E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
             "Lookups containing the lemmatization tables. See the docs for "
             "details: https://spacy.io/api/lemmatizer#init")
-    E174 = ("Architecture '{name}' not found in registry. Available "
-            "names: {names}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
@@ -597,10 +536,19 @@ class Errors(object):
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
-    E200 = ("Specifying a base model with a pretrained component '{component}' "
-            "can not be combined with adding a pretrained Tok2Vec layer.")

     # TODO: fix numbering after merging develop into master
+    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+            "array and {doc_length} for the Doc itself.")
+    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+    E973 = ("Unexpected type for NER data")
+    E974 = ("Unknown {obj} attribute: {key}")
+    E975 = ("The method Example.from_dict expects a Doc as first argument, "
+            "but got {type}")
+    E976 = ("The method Example.from_dict expects a dict as second argument, "
+            "but received None.")
+    E977 = ("Can not compare a MorphAnalysis with a string object. "
+            "This is likely a bug in spaCy, so feel free to open an issue.")
     E978 = ("The {method} method of component {name} takes a list of Example objects, "
             "but found {types} instead.")
     E979 = ("Cannot convert {type} to an Example object.")
@@ -648,13 +596,8 @@ class Errors(object):
 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pretrained Tagger models is not currently supported.")
-    T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
-    T008 = ("Bad configuration of Tagger. This is probably a bug within "
-            "spaCy. We changed the name of an internal attribute for loading "
-            "pretrained vectors, and the class has been passed the old name "
-            "(pretrained_dims) but not the new name (pretrained_vectors).")


 # fmt: on
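The renumbered E9xx block above is what the Example refactor below relies on. The entries are plain format templates, so a call site fills the placeholders before raising; a sketch with illustrative values:

    raise ValueError(Errors.E974.format(obj="doc", key="CATS"))
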
@@ -45,7 +45,7 @@ class Corpus:

     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
-            if max_length >= 1 and len(reference) >= max_length:
+            if len(reference) >= max_length >= 1:
                 if reference.is_sentenced:
                     for ref_sent in reference.sents:
                         yield Example(
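The rewritten condition is a chained comparison: `len(reference) >= max_length >= 1` tests both bounds in one expression, equivalent to the old conjunction. A quick equivalence check with illustrative values:

    len_ref, max_length = 12, 10
    assert (len_ref >= max_length >= 1) == (max_length >= 1 and len_ref >= max_length)
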
@@ -2,7 +2,6 @@ import warnings

 import numpy

-from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
@@ -11,9 +10,8 @@ from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
-from ..errors import Errors, AlignmentError
+from ..errors import Errors, Warnings
 from ..syntax import nonproj
-from ..util import get_words_and_spaces


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
@@ -32,11 +30,10 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
         """ Doc can either be text, or an actual Doc """
-        msg = "Example.__init__ got None for '{arg}'. Requires Doc."
         if predicted is None:
-            raise TypeError(msg.format(arg="predicted"))
+            raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
-            raise TypeError(msg.format(arg="reference"))
+            raise TypeError(Errors.E972.format(arg="reference"))
         self.x = predicted
         self.y = reference
         self._alignment = alignment
@@ -47,7 +44,7 @@ cdef class Example:

         def __set__(self, doc):
             self.x = doc

     property reference:
         def __get__(self):
             return self.y
@@ -60,13 +57,13 @@ cdef class Example:
             self.x.copy(),
             self.y.copy()
         )

     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
         if example_dict is None:
-            raise ValueError("Example.from_dict expected dict, received None")
+            raise ValueError(Errors.E976)
         if not isinstance(predicted, Doc):
-            raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
+            raise TypeError(Errors.E975.format(type=type(predicted)))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
@@ -78,7 +75,7 @@ cdef class Example:
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
         )

     @property
     def alignment(self):
         if self._alignment is None:
@@ -118,7 +115,8 @@ cdef class Example:
         aligned_deps = [None] * self.x.length
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
-        heads, deps = nonproj.projectivize(heads, deps)
+        if projectivize:
+            heads, deps = nonproj.projectivize(heads, deps)
         for cand_i in range(self.x.length):
             gold_i = cand_to_gold[cand_i]
             if gold_i is not None:  # Alignment found
@@ -151,7 +149,7 @@ cdef class Example:
         x_text = self.x.text[end_char:]
         x_text_offset = end_char
         x_tags = biluo_tags_from_offsets(
             self.x,
             [(e.start_char, e.end_char, e.label_) for e in x_spans],
             missing=None
         )
@@ -245,11 +243,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
         elif key == "cats":
             pass
         else:
-            raise ValueError(f"Unknown doc attribute: {key}")
+            raise ValueError(Errors.E974.format(obj="doc", key=key))

     for key, value in tok_annot.items():
         if key not in IDS:
-            raise ValueError(f"Unknown token attribute: {key}")
+            raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
             pass
         elif key == "HEAD":
@@ -289,7 +287,7 @@ def _add_entities_to_doc(doc, ner_data):
         doc.ents = ner_data
         doc.ents = [span for span in ner_data if span.label_]
     else:
-        raise ValueError("Unexpected type for NER data")
+        raise ValueError(Errors.E973)


 def _parse_example_dict_data(example_dict):
@@ -341,7 +339,7 @@ def _fix_legacy_dict_data(example_dict):
     if "HEAD" in token_dict and "SENT_START" in token_dict:
         # If heads are set, we don't also redundantly specify SENT_START.
         token_dict.pop("SENT_START")
-        warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
+        warnings.warn(Warnings.W092)
     return {
         "token_annotation": token_dict,
         "doc_annotation": doc_dict

@@ -145,7 +145,7 @@ def json_to_annotations(doc):
         example["doc_annotation"] = dict(
             cats=cats,
             entities=ner_tags,
-            links=paragraph.get("links", [])  # TODO: fix/test
+            links=paragraph.get("links", [])
         )
         yield example

@@ -107,9 +107,9 @@ cdef class Morphology:
         Returns the hash of the new analysis.
         """
         cdef MorphAnalysisC* tag_ptr
-        if features == self.EMPTY_MORPH:
-            features = ""
         if isinstance(features, str):
+            if features == self.EMPTY_MORPH:
+                features = ""
             tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
             if tag_ptr != NULL:
                 return tag_ptr.key

@@ -70,7 +70,7 @@ class SimpleNER(Pipe):
     def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
         if not any(_has_ner(eg) for eg in examples):
             return 0
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
         loss, d_scores = self.get_loss(examples, scores)
@@ -89,7 +89,8 @@ class SimpleNER(Pipe):
         d_scores = []
         truths = []
         for eg in examples:
-            gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
+            tags = eg.get_aligned("TAG", as_string=True)
+            gold_tags = [(tag if tag != "-" else None) for tag in tags]
             if not self.is_biluo:
                 gold_tags = biluo_to_iob(gold_tags)
             truths.append(gold_tags)
@@ -128,8 +129,8 @@ class SimpleNER(Pipe):
         pass


-def _has_ner(eg):
-    for ner_tag in eg.gold.ner:
+def _has_ner(example):
+    for ner_tag in example.get_aligned_ner():
         if ner_tag != "-" and ner_tag is not None:
             return True
     else:
@@ -220,8 +220,11 @@ class TrainingSchema(BaseModel):


 class ProjectConfigAsset(BaseModel):
+    # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: StrictStr = Field(..., title="URL of asset")
+    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    # fmt: on


 class ProjectConfigCommand(BaseModel):
@@ -229,11 +232,15 @@ class ProjectConfigCommand(BaseModel):
     name: StrictStr = Field(..., title="Name of command")
     help: Optional[StrictStr] = Field(None, title="Command description")
     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
+    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
+    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
     # fmt: on

+    class Config:
+        title = "A single named command specified in a project config"
+        extra = "forbid"
+

 class ProjectConfigSchema(BaseModel):
     # fmt: off
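With the dvc_ prefix dropped, a command entry that validates against ProjectConfigCommand would look like this (field values are illustrative, not from the spaCy repo):

    ProjectConfigCommand(
        name="preprocess",
        script=["python scripts/preprocess.py"],
        deps=["assets/data.json"],
        outputs=["corpus/train.spacy"],
    )

Because the nested Config sets extra = "forbid", pydantic now rejects unknown keys such as the old dvc_deps instead of silently ignoring them.
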
@@ -230,14 +230,13 @@ def test_json2docs_no_ner(en_vocab):
         Doc(
             doc.vocab,
             words=[w.text for w in doc],
-            spaces=[bool(w.whitespace_) for w in doc]
+            spaces=[bool(w.whitespace_) for w in doc],
         ),
-        doc
+        doc,
     )
     ner_tags = eg.get_aligned_ner()
     assert ner_tags == [None, None, None, None, None]


-
 def test_split_sentences(en_vocab):
     words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
@@ -283,8 +282,8 @@ def test_split_sentences(en_vocab):
     assert split_examples[1].text == "had loads of fun "


-def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
-    # one-to-many
+@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
+def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
     words = ["I", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
@@ -292,9 +291,28 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "O", "U-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", None, "U-LOC", "O"]

-    # many-to-one
+
+def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
     words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
@@ -304,31 +322,38 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

-    # misaligned
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew to", "San Francisco Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
+
+
+@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
+def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    offset_start = len("I flew to ")
-    offset_end = len("I flew to San Francisco Valley")
-    entities = [(offset_start, offset_end, "LOC")]
-    links = {(offset_start, offset_end): {"Q816843": 1.0}}
+    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
-    example = Example.from_dict(
-        doc, {"words": gold_words, "entities": entities, "links": links}
-    )
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
-    assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"]
-    #assert example.get_aligned("ENT_KB_ID", as_string=True) == [
-    #    "",
-    #    "",
-    #    "Q816843",
-    #    "Q816843",
-    #    "",
-    #]
-    #assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
-    #    "Q816843": 1.0
-    #}
+    assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
+
+
+def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(
         ["I", "flew", "to", "San Francisco", "Valley", "."],
@@ -344,7 +369,8 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

-    # from issue #4791
+
+def test_gold_biluo_4791(en_vocab, en_tokenizer):
     doc = en_tokenizer("I'll return the ₹54 amount")
     gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
     gold_spaces = [False, True, True, True, False, True, False]
@@ -593,7 +619,6 @@ def test_tuple_format_implicit_invalid():
     _train(train_data)


-
 def _train(train_data):
     nlp = English()
     ner = nlp.create_pipe("ner")
@@ -1,15 +1,14 @@
 import numpy
 import tempfile
-import shutil
 import contextlib
 import srsly
-from pathlib import Path

 from spacy import Errors
 from spacy.tokens import Doc, Span
 from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH

 from spacy.vocab import Vocab
+from spacy.util import make_tempdir  # noqa: F401


 @contextlib.contextmanager
@@ -19,13 +18,6 @@ def make_tempfile(mode="r"):
     f.close()


-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def get_doc(
     vocab,
     words=[],
@@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors


-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")


 class DocBin(object):
@@ -816,7 +816,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         if length != len(self):
-            raise ValueError("Cannot set array values longer than the document.")
+            raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))

         # Get set up for fast loading
         cdef Pool mem = Pool()
@@ -1,6 +1,7 @@
 from libc.string cimport memset
 cimport numpy as np

+from ..errors import Errors
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_by_field
@@ -49,6 +50,8 @@ cdef class MorphAnalysis:
         return self.key

     def __eq__(self, other):
+        if isinstance(other, str):
+            raise ValueError(Errors.E977)
         return self.key == other.key

     def __ne__(self, other):
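With the new guard, comparing a MorphAnalysis directly against a string raises E977 instead of silently returning False. A sketch of the intended pattern, assuming a doc whose first token carries the feature Case=Nom:

    morph = doc[0].morph              # MorphAnalysis object
    assert str(morph) == "Case=Nom"   # compare via the string form instead
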
@@ -19,6 +19,9 @@ from packaging.specifiers import SpecifierSet, InvalidSpecifier
 from packaging.version import Version, InvalidVersion
 import subprocess
 from contextlib import contextmanager
+import tempfile
+import shutil
+import hashlib


 try:
@@ -455,6 +458,37 @@ def working_dir(path: Union[str, Path]) -> None:
     os.chdir(prev_cwd)


+@contextmanager
+def make_tempdir():
+    """Execute a block in a temporary directory and remove the directory and
+    its contents at the end of the with block.
+
+    YIELDS (Path): The path of the temp directory.
+    """
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def get_hash(data) -> str:
+    """Get the hash for a JSON-serializable object.
+
+    data: The data to hash.
+    RETURNS (str): The hash.
+    """
+    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
+    return hashlib.md5(data_str).hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file given its file path.
+
+    path (Union[Path, str]): The file path.
+    RETURNS (str): The checksum.
+    """
+    return hashlib.md5(Path(path).read_bytes()).hexdigest()
+
+
 def is_in_jupyter():
     """Check if user is running spaCy from a Jupyter notebook by detecting the
     IPython kernel. Mainly used for the displaCy visualizer.
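
The two new helpers hash different inputs: get_hash digests the sorted JSON dump of a Python object, while get_checksum digests a file's raw bytes. A quick sketch (the argument values are illustrative):

    config_hash = get_hash({"commands": [], "run": []})  # md5 of canonical JSON
    file_checksum = get_checksum("project.yml")          # md5 of file contents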