Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge remote-tracking branch 'upstream/master' into chore/v4-merge-master-20221222

This commit is contained in: commit 207565a788

.github/azure-steps.yml (vendored): 2 changed lines

@@ -107,7 +107,7 @@ steps:
    displayName: "Run CPU tests"

  - script: |
      python -m pip install --pre thinc-apple-ops
      python -m pip install 'spacy[apple]'
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

.github/workflows/lock.yml (vendored): 2 changed lines

@@ -15,7 +15,7 @@ jobs:
  action:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v3
      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'

@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)

@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 |
| 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md


## 💬 Where to ask questions

The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev0,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0

@@ -39,7 +39,7 @@ install_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=9.0.0.dev0,<9.1.0
    wasabi>=0.9.1,<1.1.0
    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies

@@ -16,6 +16,7 @@ from .debug_config import debug_config  # noqa: F401
from .debug_model import debug_model  # noqa: F401
from .debug_diff import debug_diff  # noqa: F401
from .evaluate import evaluate  # noqa: F401
from .apply import apply  # noqa: F401
from .convert import convert  # noqa: F401
from .init_pipeline import init_pipeline_cli  # noqa: F401
from .init_config import init_config, fill_config  # noqa: F401

@@ -158,15 +158,15 @@ def load_project_config(
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    return config

@@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")


def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
    if not path.is_dir():
        return [path]
    paths = [path]
    locs = []
    seen = set()
    for path in paths:
        if str(path) in seen:
            continue
        seen.add(str(path))
        if path.parts[-1].startswith("."):
            continue
        elif path.is_dir():
            paths.extend(path.iterdir())
        elif suffix is not None and not path.parts[-1].endswith(suffix):
            continue
        else:
            locs.append(path)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs


def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
    """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
    as happens with `round(number, ndigits)`"""

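For orientation (not part of the diff): a minimal sketch of how the shared helper above is meant to be called, with a hypothetical `corpus/` directory.

```python
from pathlib import Path

from spacy.cli._util import walk_directory

# Recursively collect all non-hidden files under corpus/ ...
all_files = walk_directory(Path("corpus"))
# ... or only files whose names end in the given suffix.
jsonl_files = walk_directory(Path("corpus"), suffix=".jsonl")
```
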
spacy/cli/apply.py (new file): 143 lines added

@@ -0,0 +1,143 @@
import tqdm
import srsly

from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union

from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory

from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model


path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""

out_help = "Path to save the resulting .spacy file"
code_help = (
    "Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
    "The provided output file already exists. "
    "To force overwriting the output file, set the --force or -F flag."
)


DocOrStrStream = Union[Iterable[str], Iterable[Doc]]


def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Stream Doc objects from DocBin.
    """
    docbin = DocBin().from_disk(path)
    for doc in docbin.get_docs(vocab):
        yield doc


def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Stream "text" field from JSONL. If the field "text" is
    not found it raises error.
    """
    for entry in srsly.read_jsonl(path):
        if field not in entry:
            msg.fail(
                f"{path} does not contain the required '{field}' field.", exits=1
            )
        else:
            yield entry[field]


def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
    """
    Yields strings from text files in paths.
    """
    for path in paths:
        with open(path, "r") as fin:
            text = fin.read()
            yield text


@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
        1. .spacy files
        2. .jsonl files with a specified "field" to read the text from.
        3. Files with any other extension are assumed to be containing
           a single document.
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)


def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
):
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if len(paths) == 0:
        docbin.to_disk(output_file)
        msg.warn("Did not find data to process,"
                 f" {data_path} seems to be an empty directory.")
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    if len(text_files) > 0:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin.to_disk(output_file)

@@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer

@@ -7,7 +7,7 @@ import re
import sys
import itertools

from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs

@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
    return None


def walk_directory(path: Path, converter: str) -> List[Path]:
    if not path.is_dir():
        return [path]
    paths = [path]
    locs = []
    seen = set()
    for path in paths:
        if str(path) in seen:
            continue
        seen.add(str(path))
        if path.parts[-1].startswith("."):
            continue
        elif path.is_dir():
            paths.extend(path.iterdir())
        elif converter == "json" and not path.parts[-1].endswith("json"):
            continue
        elif converter == "conll" and not path.parts[-1].endswith("conll"):
            continue
        elif converter == "iob" and not path.parts[-1].endswith("iob"):
            continue
        else:
            locs.append(path)
    # It's good to sort these, in case the ordering messes up cache.
    locs.sort()
    return locs


def verify_cli_args(
    msg: Printer,
    input_path: Path,

@@ -101,8 +101,8 @@ def project_run(
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_kwargs = {"exits": 1} if not dry else {}
                msg.fail(err, err_help, **err_kwargs)
                err_exits = 1 if not dry else None
                msg.fail(err, err_help, exits=err_exits)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)

@@ -336,6 +336,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")

@@ -15,7 +15,7 @@

STOP_WORDS = set(
    """
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds

ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven

@@ -350,9 +350,9 @@ class EditTreeLemmatizer(TrainablePipe):

            tree = dict(tree)
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
                tree["orig"] = self.vocab.strings.add(tree["orig"])
            if "orig" in tree:
                tree["subst"] = self.vocab.strings[tree["subst"]]
                tree["subst"] = self.vocab.strings.add(tree["subst"])

            trees.append(tree)

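For background on why the lookup was switched to `add` (a small sketch, not part of the commit): a plain `StringStore` lookup hashes a string without interning it, so strings from deserialized edit trees would not resolve from their hashes later on.

```python
from spacy.strings import StringStore

strings = StringStore()
key = strings["ing"]      # computes the hash but does not store the string
assert "ing" not in strings
key = strings.add("ing")  # interns the string so the hash resolves later
assert "ing" in strings
```
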
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

    # head before start
    arr = doc.to_array(["HEAD"])
    arr[0] = -1
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

    # head after end
    arr = doc.to_array(["HEAD"])
    arr[0] = 5
    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)

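A note on the casts exercised by this test (illustration only, not part of the diff): the attribute array handed to `Doc.from_array` is unsigned 64-bit, so a negative relative head offset is written as its 32-bit two's-complement bit pattern, and newer NumPy versions warn about or reject assigning a negative Python int into an unsigned array directly.

```python
import numpy

head = numpy.int32(-1).astype(numpy.uint64)  # 4294967295, the bit pattern of int32 -1
assert head.astype(numpy.int32) == -1        # round-trips back to the signed offset
```
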
@@ -1,7 +1,10 @@
from typing import List

import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup
from spacy.tokens import Span, SpanGroup, Doc
from spacy.util import filter_spans


@pytest.fixture

@@ -242,3 +245,13 @@ def test_span_group_extend(doc):
def test_span_group_dealloc(span_group):
    with pytest.raises(AttributeError):
        print(span_group.doc)


@pytest.mark.issue(11975)
def test_span_group_typing(doc: Doc):
    """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
    span_group: SpanGroup = doc.spans["SPANS"]
    spans: List[Span] = list(span_group)
    for i, span in enumerate(span_group):
        assert span == span_group[i] == spans[i]
    filter_spans(span_group)

@@ -62,10 +62,45 @@ def test_initialize_from_labels():
    nlp2 = Language()
    lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
    lemmatizer2.initialize(
        get_examples=lambda: train_examples,
        # We want to check that the strings in replacement nodes are
        # added to the string store. Avoid that they get added through
        # the examples.
        get_examples=lambda: train_examples[:1],
        labels=lemmatizer.label_data,
    )
    assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
    assert lemmatizer2.label_data == {
        "trees": [
            {"orig": "S", "subst": "s"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 0,
                "suffix_tree": 4294967295,
            },
            {"orig": "s", "subst": ""},
            {
                "prefix_len": 0,
                "suffix_len": 1,
                "prefix_tree": 4294967295,
                "suffix_tree": 2,
            },
            {
                "prefix_len": 0,
                "suffix_len": 0,
                "prefix_tree": 4294967295,
                "suffix_tree": 4294967295,
            },
            {"orig": "E", "subst": "e"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 5,
                "suffix_tree": 4294967295,
            },
        ],
        "labels": (1, 3, 4, 6),
    }


def test_no_data():

@@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
import pkg_resources
import time

import spacy
import numpy
import pytest
import srsly

@@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.cli.apply import apply
from spacy.cli.find_threshold import find_threshold
from spacy.lang.en import English
from spacy.lang.nl import Dutch

@@ -123,6 +125,25 @@ def test_issue7055():
    assert "model" in filled_cfg["components"]["ner"]


@pytest.mark.issue(11235)
def test_issue11235():
    """
    Test that the cli handles interpolation in the directory names correctly when loading project config.
    """
    lang_var = "en"
    variables = {"lang": lang_var}
    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
    directories = ["cfg", "${vars.lang}_model"]
    project = {"commands": commands, "vars": variables, "directories": directories}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # Check that the directories are interpolated and created correctly
        assert os.path.exists(d / "cfg")
        assert os.path.exists(d / f"{lang_var}_model")
    assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"


def test_cli_info():
    nlp = Dutch()
    nlp.add_pipe("textcat")

@@ -866,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
    assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]


def test_applycli_empty_dir():
    with make_tempdir() as data_path:
        output = data_path / "test.spacy"
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_docbin():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        # test empty DocBin case
        docbin = DocBin()
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "text", 1, 1)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_jsonl():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        data = [{"field": "Testing apply cli.", "key": 234}]
        data2 = [{"field": "234"}]
        srsly.write_jsonl(data_path / "test.jsonl", data)
        apply(data_path, output, "blank:en", "field", 1, 1)
        srsly.write_jsonl(data_path / "test2.jsonl", data2)
        apply(data_path, output, "blank:en", "field", 1, 1)


def test_applycli_txt():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        with open(data_path / "test.foo", "w") as ftest:
            ftest.write("Testing apply cli.")
        apply(data_path, output, "blank:en", "text", 1, 1)


def test_applycli_mixed():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        text = "Testing apply cli"
        nlp = spacy.blank("en")
        doc = nlp(text)
        jsonl_data = [{"text": text}]
        srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
        docbin = DocBin()
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        with open(data_path / "test.txt", "w") as ftest:
            ftest.write(text)
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Check whether it worked
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert len(result) == 3
        for doc in result:
            assert doc.text == text


def test_applycli_user_data():
    Doc.set_extension("ext", default=0)
    val = ("ext", 0)
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        doc._.ext = val
        docbin = DocBin(store_user_data=True)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "", 1, 1)
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert result[0]._.ext == val


def test_local_remote_storage():
    with make_tempdir() as d:
        filename = "a.txt"

@@ -359,6 +359,7 @@ cdef class Doc:
            for annot in annotations:
                if annot:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]

@@ -1573,6 +1574,7 @@ cdef class Doc:

            for j, (attr, annot) in enumerate(token_annotations.items()):
                if attr is HEAD:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                    for i in range(len(words)):
                        array[i, j] = annot[i]
                elif attr is MORPH:

@@ -93,8 +93,8 @@ class Span:
        self,
        start_idx: int,
        end_idx: int,
        label: int = ...,
        kb_id: int = ...,
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
    ) -> Span: ...
    @property

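A runtime illustration of what the widened stub describes (a sketch; the text and the "ORG" label are arbitrary examples): string labels were already accepted by `char_span`, the type annotation now matches.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying a startup")
span = doc[0:5]
# With the updated stub, a plain string label type-checks as well as an int hash.
ent = span.char_span(0, 5, label="ORG")
print(ent.text, ent.label_)
```
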
@@ -318,7 +318,7 @@ cdef class Span:
                    for ancestor in ancestors:
                        ancestor_i = ancestor.i - span_c.start
                        if ancestor_i in range(length):
                            array[i, head_col] = ancestor_i - i
                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)

                # if there is no appropriate ancestor, define a new artificial root
                value = array[i, head_col]

@@ -326,7 +326,7 @@ cdef class Span:
                    new_root = old_to_new_root.get(ancestor_i, None)
                    if new_root is not None:
                        # take the same artificial root as a previous token from the same sentence
                        array[i, head_col] = new_root - i
                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                    else:
                        # set this token as the new artificial root
                        array[i, head_col] = 0

@@ -18,6 +18,7 @@ class SpanGroup:
    def doc(self) -> Doc: ...
    @property
    def has_overlap(self) -> bool: ...
    def __iter__(self): ...
    def __len__(self) -> int: ...
    def append(self, span: Span) -> None: ...
    def extend(self, spans: Iterable[Span]) -> None: ...

@@ -159,6 +159,16 @@ cdef class SpanGroup:
            return self._concat(other)
        return NotImplemented

    def __iter__(self):
        """
        Iterate over the spans in this SpanGroup.
        YIELDS (Span): A span in this SpanGroup.

        DOCS: https://spacy.io/api/spangroup#iter
        """
        for i in range(self.c.size()):
            yield self[i]

    def append(self, Span span):
        """Add a span to the group. The span must refer to the same Doc
        object as the span group.

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
            pass
            continue
        elif key == "HEAD":
            attrs.append(key)
            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
            values.append([to_ternary_int(v) for v in value])
            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
            values.append([vocab.strings.add(v) for v in value])
    array = numpy.asarray(values, dtype="uint64")
            row = [vocab.strings.add(v) for v in value]
        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T

@@ -12,6 +12,7 @@ menu:
  - ['train', 'train']
  - ['pretrain', 'pretrain']
  - ['evaluate', 'evaluate']
  - ['apply', 'apply']
  - ['find-threshold', 'find-threshold']
  - ['assemble', 'assemble']
  - ['package', 'package']

@@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |

## apply {#apply new="3.5" tag="command"}

Applies a trained pipeline to data and stores the resulting annotated documents
in a `DocBin`. The input can be a single file or a directory. The recognized
input formats are:

1. `.spacy`
2. `.jsonl` containing a user specified `text_key`
3. Files with any other extension are assumed to be plain text files containing
   a single document.

When a directory is provided it is traversed recursively to collect all files.

```cli
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```

| Name                                      | Description                                                                                                                                                                           |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model`                                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
| `data_path`                               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
| `output-file`, `-o`                       | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk`                       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
| `--force-overwrite`, `-F`                 | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
| `--gpu-id`, `-g`                          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
| `--batch-size`, `-b`                      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
| `--n-process`, `-n`                       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
| `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |

## find-threshold {#find-threshold new="3.5" tag="command"}

Runs prediction trials for a trained model with varying tresholds to maximize

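As a companion to the documentation above (illustrative only; the output path `out.spacy` and the blank English pipeline are assumed): once `spacy apply` has written its output, the annotations can be read back from the `DocBin`.

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Load the file produced by `spacy apply` and reconstruct the annotated Docs.
doc_bin = DocBin().from_disk("out.spacy")
for doc in doc_bin.get_docs(nlp.vocab):
    print(doc.text, [(ent.text, ent.label_) for ent in doc.ents])
```
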
@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix`         | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
| `prefix_`        | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
| `suffix`         | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |
| `suffix_`        | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
| `suffix_`        | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~                                                                                                                                                                                            |
| `is_alpha`       | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~                                                                                                                                                                    |
| `is_ascii`       | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~                                                                                                                                                     |
| `is_digit`       | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~                                                                                                                                                                                   |

@@ -202,6 +202,23 @@ already present in the current span group.
| `other`     | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~                                           |

## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}

Iterate over the spans in this span group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> for error_span in doc.spans["errors"]:
>     print(error_span)
> ```

| Name       | Description                         |
| ---------- | ----------------------------------- |
| **YIELDS** | A span in this span group. ~~Span~~ |

## SpanGroup.append {#append tag="method"}

Add a [`Span`](/api/span) object to the group. The span must refer to the same

@@ -45,7 +45,7 @@
                    { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
                    {
                        "text": "Custom Solutions",
                        "url": "https://explosion.ai/spacy-tailored-pipelines"
                        "url": "https://explosion.ai/custom-solutions"
                    }
                ]
            }

@@ -51,7 +51,7 @@
                { "text": "Online Course", "url": "https://course.spacy.io" },
                {
                    "text": "Custom Solutions",
                    "url": "https://explosion.ai/spacy-tailored-pipelines"
                    "url": "https://explosion.ai/custom-solutions"
                }
            ]
        },

@@ -1024,25 +1024,6 @@
            "category": ["pipeline"],
            "spacy_version": 2
        },
        {
            "id": "spacy-sentence-segmenter",
            "title": "Sentence Segmenter",
            "slogan": "Custom sentence segmentation for spaCy",
            "code_example": [
                "from seg.newline.segmenter import NewLineSegmenter",
                "import spacy",
                "",
                "nlseg = NewLineSegmenter()",
                "nlp = spacy.load('en')",
                "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
                "doc = nlp(my_doc_text)"
            ],
            "author": "tc64",
            "author_links": {
                "github": "tc64"
            },
            "category": ["pipeline"]
        },
        {
            "id": "spacy_cld",
            "title": "spaCy-CLD",

@@ -1472,13 +1453,26 @@
            "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
            "code_example": [
                "import spacy",
                "import scattertext as st",
                "",
                "nlp = spacy.load('en')",
                "corpus = st.CorpusFromPandas(convention_df,",
                "                             category_col='party',",
                "                             text_col='text',",
                "                             nlp=nlp).build()"
                "from scattertext import SampleCorpora, produce_scattertext_explorer",
                "from scattertext import produce_scattertext_html",
                "from scattertext.CorpusFromPandas import CorpusFromPandas",
                "",
                "nlp = spacy.load('en_core_web_sm')",
                "convention_df = SampleCorpora.ConventionData2012.get_data()",
                "corpus = CorpusFromPandas(convention_df,",
                "                          category_col='party',",
                "                          text_col='text',",
                "                          nlp=nlp).build()",
                "",
                "html = produce_scattertext_html(corpus,",
                "                                    category='democrat',",
                "                                    category_name='Democratic',",
                "                                    not_category_name='Republican',",
                "                                    minimum_term_frequency=5,",
                "                                    width_in_pixels=1000)",
                "open('./simple.html', 'wb').write(html.encode('utf-8'))",
                "print('Open ./simple.html in Chrome or Firefox.')"
            ],
            "author": "Jason Kessler",
            "author_links": {

@@ -105,13 +105,13 @@ const Landing = ({ data }) => {

            <LandingBannerGrid>
                <LandingBanner
                    to="https://explosion.ai/spacy-tailored-pipelines"
                    to="https://explosion.ai/custom-solutions"
                    button="Learn more"
                    background="#E4F4F9"
                    color="#1e1935"
                    small
                >
                    <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
                    <Link to="https://explosion.ai/custom-solutions" hidden>
                        <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
                    </Link>
                    <strong>