Merge branch 'master' into feature/candidate-generation-by-docs

2025-09-07 12:54:56 +03:00 · 2022-12-12 13:46:07 +01:00 · 2022-12-12 13:46:07 +01:00 · 77680421b4
commit 77680421b4
parent 2870c8f4d6 e5c7f3b077
27 changed files with 217 additions and 81 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -107,7 +107,7 @@ steps:
    displayName: "Run CPU tests"
  - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@ -15,11 +15,11 @@ jobs:
  action:
    runs-on: ubuntu-latest
    steps:
-      - uses: dessant/lock-threads@v3
+      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
-          issue-comment: > 
+          issue-comment: >
-            This thread has been automatically locked since there 
+            This thread has been automatically locked since there
-            has not been any recent activity after it was closed. 
+            has not been any recent activity after it was closed.
            Please open a new issue for related bugs.
--- a/README.md
+++ b/README.md
@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
 | 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 |
 | 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 ## 💬 Where to ask questions
 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -41,7 +41,7 @@ jobs:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
          python.version: "3.6"
        #        Python36Windows:
        #          imageName: "windows-latest"
@ -50,7 +50,7 @@ jobs:
        #          imageName: "macos-latest"
        #          python.version: "3.6"
        #        Python37Linux:
-        #          imageName: "ubuntu-latest"
+        #          imageName: "ubuntu-20.04"
        #          python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"
--- a/requirements.txt
+++ b/requirements.txt
@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=8.1.0,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0
--- a/setup.cfg
+++ b/setup.cfg
@ -47,7 +47,7 @@ install_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.1.0,<8.2.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -158,15 +158,15 @@ def load_project_config(
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    return config
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -101,8 +101,8 @@ def project_run(
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-                err_kwargs = {"exits": 1} if not dry else {}
+                err_exits = 1 if not dry else None
-                msg.fail(err, err_help, **err_kwargs)
+                msg.fail(err, err_help, exits=err_exits)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -1,7 +1,7 @@
 {# This is a template for training configs used for the quickstart widget in
 the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
 [paths]
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):
            tree = dict(tree)
            if "orig" in tree:
-                tree["orig"] = self.vocab.strings[tree["orig"]]
+                tree["orig"] = self.vocab.strings.add(tree["orig"])
            if "orig" in tree:
-                tree["subst"] = self.vocab.strings[tree["subst"]]
+                tree["subst"] = self.vocab.strings.add(tree["subst"])
            trees.append(tree)
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
        DOCS: https://spacy.io/api/spancategorizer#predict
        """
        indices = self.suggester(docs, ops=self.model.ops)
-        scores = self.model.predict((docs, indices))  # type: ignore
+        if indices.lengths.sum() == 0:
            scores = self.model.ops.alloc2f(0, 0)
        else:
            scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores
    def set_candidates(
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
    # head before start
    arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
    # head after end
    arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
    doc_from_array = Doc(en_vocab, words=words)
    with pytest.raises(ValueError):
        doc_from_array.from_array(["HEAD"], arr)
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@ -60,10 +60,45 @@ def test_initialize_from_labels():
    nlp2 = Language()
    lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
    lemmatizer2.initialize(
-        get_examples=lambda: train_examples,
+        # We want to check that the strings in replacement nodes are
        # added to the string store. Avoid that they get added through
        # the examples.
        get_examples=lambda: train_examples[:1],
        labels=lemmatizer.label_data,
    )
    assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
    assert lemmatizer2.label_data == {
        "trees": [
            {"orig": "S", "subst": "s"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 0,
                "suffix_tree": 4294967295,
            },
            {"orig": "s", "subst": ""},
            {
                "prefix_len": 0,
                "suffix_len": 1,
                "prefix_tree": 4294967295,
                "suffix_tree": 2,
            },
            {
                "prefix_len": 0,
                "suffix_len": 0,
                "prefix_tree": 4294967295,
                "suffix_tree": 4294967295,
            },
            {"orig": "E", "subst": "e"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 5,
                "suffix_tree": 4294967295,
            },
        ],
        "labels": (1, 3, 4, 6),
    }
 def test_no_data():
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():
 def test_zero_suggestions():
-    # Test with a suggester that returns 0 suggestions
+    # Test with a suggester that can return 0 suggestions
-    @registry.misc("test_zero_suggester")
+    @registry.misc("test_mixed_zero_suggester")
-    def make_zero_suggester():
+    def make_mixed_zero_suggester():
-        def zero_suggester(docs, *, ops=None):
+        def mixed_zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
-            return Ragged(
+            spans = []
-                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
+            lengths = []
-            )
+            for doc in docs:
                if len(doc) > 0 and len(doc) % 2 == 0:
                    spans.append((0, 1))
                    lengths.append(1)
                else:
                    lengths.append(0)
            spans = ops.asarray2i(spans)
            lengths_array = ops.asarray1i(lengths)
            if len(spans) > 0:
                output = Ragged(ops.xp.vstack(spans), lengths_array)
            else:
                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
            return output
-        return zero_suggester
+        return mixed_zero_suggester
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
-        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+        config={
            "suggester": {"@misc": "test_mixed_zero_suggester"},
            "spans_key": SPAN_KEY,
        },
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
@ -397,6 +412,16 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}
    nlp.update(train_examples, sgd=optimizer)
    # empty doc
    nlp("")
    # single doc with zero suggestions
    nlp("one")
    # single doc with one suggestion
    nlp("two two")
    # batch with mixed zero/one suggestions
    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
    # batch with no suggestions
    list(nlp.pipe(["", "one", "three three three"]))
 def test_set_candidates():
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -123,6 +123,25 @@ def test_issue7055():
    assert "model" in filled_cfg["components"]["ner"]
@pytest.mark.issue(11235)
 def test_issue11235():
    """
    Test that the cli handles interpolation in the directory names correctly when loading project config.
    """
    lang_var = "en"
    variables = {"lang": lang_var}
    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
    directories = ["cfg", "${vars.lang}_model"]
    project = {"commands": commands, "vars": variables, "directories": directories}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # Check that the directories are interpolated and created correctly
        assert os.path.exists(d / "cfg")
        assert os.path.exists(d / f"{lang_var}_model")
    assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
 def test_cli_info():
    nlp = Dutch()
    nlp.add_pipe("textcat")
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -359,6 +359,7 @@ cdef class Doc:
            for annot in annotations:
                if annot:
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                        for i in range(len(words)):
                            if attrs.ndim == 1:
                                attrs[i] = annot[i]
@ -1558,6 +1559,7 @@ cdef class Doc:
            for j, (attr, annot) in enumerate(token_annotations.items()):
                if attr is HEAD:
                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                    for i in range(len(words)):
                        array[i, j] = annot[i]
                elif attr is MORPH:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -299,7 +299,7 @@ cdef class Span:
                    for ancestor in ancestors:
                        ancestor_i = ancestor.i - self.c.start
                        if ancestor_i in range(length):
-                            array[i, head_col] = ancestor_i - i
+                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
                # if there is no appropriate ancestor, define a new artificial root
                value = array[i, head_col]
@ -307,7 +307,7 @@ cdef class Span:
                    new_root = old_to_new_root.get(ancestor_i, None)
                    if new_root is not None:
                        # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                    else:
                        # set this token as the new artificial root
                        array[i, head_col] = 0
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
        if key not in IDS:
            raise ValueError(Errors.E974.format(obj="token", key=key))
        elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
        elif key == "HEAD":
            attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
        elif key == "DEP":
            attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
        elif key == "SENT_START":
            attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
        elif key == "MORPH":
            attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
        else:
            attrs.append(key)
            if not all(isinstance(v, str) for v in value):
                types = set([type(v) for v in value])
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
+            row = [vocab.strings.add(v) for v in value]
-    array = numpy.asarray(values, dtype="uint64")
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
    array = numpy.array(values, dtype=numpy.uint64)
    return attrs, array.T
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1643,7 +1643,9 @@ def _pipe(
    docs: Iterable["Doc"],
    proc: "PipeCallable",
    name: str,
-    default_error_handler: Callable[[str, "PipeCallable", List["Doc"], Exception], NoReturn],
+    default_error_handler: Callable[
        [str, "PipeCallable", List["Doc"], Exception], NoReturn
    ],
    kwargs: Mapping[str, Any],
 ) -> Iterator["Doc"]:
    if hasattr(proc, "pipe"):
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
 | `tags`      | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~                                                                                                                                                                                        |
 ### training.biluo_to_iob {#biluo_to_iob tag="function"}
 Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
 [IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
 to use the BILUO tags with a model that only supports IOB tags.
 > #### Example
 >
 > ```python
 > from spacy.training import biluo_to_iob
 >
 > tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 > iob_tags = biluo_to_iob(tags)
 > assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
 > ```
 | Name        | Description                                                                             |
 | ----------- | --------------------------------------------------------------------------------------- |
 | `tags`      | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
 | **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~           |
 ### training.iob_to_biluo {#iob_to_biluo tag="function"}
 Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
 [BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
 want to use the IOB tags with a model that only supports BILUO tags.
 <Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo">
 This method was previously available as `spacy.gold.iob_to_biluo`.
 </Infobox>
 > #### Example
 >
 > ```python
 > from spacy.training import iob_to_biluo
 >
 > tags = ["O", "O", "B-LOC", "I-LOC", "O"]
 > biluo_tags = iob_to_biluo(tags)
 > assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
 > ```
 | Name        | Description                                                                           |
 | ----------- | ------------------------------------------------------------------------------------- |
 | `tags`      | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
 | **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~       |
 ## Utility functions {#util source="spacy/util.py"}
 spaCy comes with a small collection of utility functions located in
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@ -308,14 +308,14 @@ Load state from a binary string.
 > assert type(PERSON) == int
 > ```
-| Name                                           | Description                                                                                                                                                            |
+| Name                                           | Description                                                                                                                                                             |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `strings`                                      | A table managing the string-to-int mapping. ~~StringStore~~                                                                                                            |
+| `strings`                                      | A table managing the string-to-int mapping. ~~StringStore~~                                                                                                             |
-| `vectors`                                      | A table associating word IDs to word vectors. ~~Vectors~~                                                                                                              |
+| `vectors`                                      | A table associating word IDs to word vectors. ~~Vectors~~                                                                                                               |
-| `vectors_length`                               | Number of dimensions for each word vector. ~~int~~                                                                                                                     |
+| `vectors_length`                               | Number of dimensions for each word vector. ~~int~~                                                                                                                      |
-| `lookups`                                      | The available lookup tables in this vocab. ~~Lookups~~                                                                                                                 |
+| `lookups`                                      | The available lookup tables in this vocab. ~~Lookups~~                                                                                                                  |
-| `writing_system`                               | A dict with information about the language's writing system. ~~Dict[str, Any]~~                                                                                        |
+| `writing_system`                               | A dict with information about the language's writing system. ~~Dict[str, Any]~~                                                                                         |
-| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
+| `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
 ## Serialization fields {#serialization-fields}
--- a/website/docs/usage/v3-4.md
+++ b/website/docs/usage/v3-4.md
@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
 | Package                                         | Model Version |  TAG | Parser LAS | NER F |
 | ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
 | [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0        | 97.3 |       90.1 |  84.6 |
-| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0        | 97.2 |       90.3 |  85.5 |
+| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0        | 97.2 |       90.3 |  85.5 |
-| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0        | 97.4 |       90.1 |  85.3 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0        | 97.4 |       90.1 |  85.3 |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0        | 97.3 |       90.2 |  85.6 |
 ## Notes about upgrading from v3.3 {#upgrading}
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@ -45,7 +45,7 @@
                    { "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
                    {
                        "text": "Custom Solutions",
-                        "url": "https://explosion.ai/spacy-tailored-pipelines"
+                        "url": "https://explosion.ai/custom-solutions"
                    }
                ]
            }
--- a/website/meta/site.json
+++ b/website/meta/site.json
@ -51,7 +51,7 @@
                { "text": "Online Course", "url": "https://course.spacy.io" },
                {
                    "text": "Custom Solutions",
-                    "url": "https://explosion.ai/spacy-tailored-pipelines"
+                    "url": "https://explosion.ai/custom-solutions"
                }
            ]
        },
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1023,25 +1023,6 @@
            },
            "category": ["pipeline"]
        },
        {
            "id": "spacy-sentence-segmenter",
            "title": "Sentence Segmenter",
            "slogan": "Custom sentence segmentation for spaCy",
            "code_example": [
                "from seg.newline.segmenter import NewLineSegmenter",
                "import spacy",
                "",
                "nlseg = NewLineSegmenter()",
                "nlp = spacy.load('en')",
                "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
                "doc = nlp(my_doc_text)"
            ],
            "author": "tc64",
            "author_links": {
                "github": "tc64"
            },
            "category": ["pipeline"]
        },
        {
            "id": "spacy_cld",
            "title": "spaCy-CLD",
@ -1468,13 +1449,26 @@
            "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
            "code_example": [
                "import spacy",
                "import scattertext as st",
                "",
-                "nlp = spacy.load('en')",
+                "from scattertext import SampleCorpora, produce_scattertext_explorer",
-                "corpus = st.CorpusFromPandas(convention_df,",
+                "from scattertext import produce_scattertext_html",
-                "                             category_col='party',",
+                "from scattertext.CorpusFromPandas import CorpusFromPandas",
-                "                             text_col='text',",
+                "",
-                "                             nlp=nlp).build()"
+                "nlp = spacy.load('en_core_web_sm')",
                "convention_df = SampleCorpora.ConventionData2012.get_data()",
                "corpus = CorpusFromPandas(convention_df,",
                "                          category_col='party',",
                "                          text_col='text',",
                "                          nlp=nlp).build()",
                "",
                "html = produce_scattertext_html(corpus,",
                "                                    category='democrat',",
                "                                    category_name='Democratic',",
                "                                    not_category_name='Republican',",
                "                                    minimum_term_frequency=5,",
                "                                    width_in_pixels=1000)",
                "open('./simple.html', 'wb').write(html.encode('utf-8'))",
                "print('Open ./simple.html in Chrome or Firefox.')"
            ],
            "author": "Jason Kessler",
            "author_links": {
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@ -105,13 +105,13 @@ const Landing = ({ data }) => {
            <LandingBannerGrid>
                <LandingBanner
-                    to="https://explosion.ai/spacy-tailored-pipelines"
+                    to="https://explosion.ai/custom-solutions"
                    button="Learn more"
                    background="#E4F4F9"
                    color="#1e1935"
                    small
                >
-                    <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden>
+                    <Link to="https://explosion.ai/custom-solutions" hidden>
                        <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" />
                    </Link>
                    <strong>