Merge remote-tracking branch 'origin/fix/types' into fix/types

2025-07-24 23:19:45 +03:00 · 2023-12-07 13:43:59 +01:00 · 2023-12-07 13:43:59 +01:00 · a2c9eeca0b
commit a2c9eeca0b
parent 47d633a7fb d009f7deb4
32 changed files with 734 additions and 58 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@ -0,0 +1 @@
 custom: https://explosion.ai/merch
--- a/README.md
+++ b/README.md
@ -46,6 +46,7 @@ open-source software, released under the
 | 📺 **[Videos]**                                                                                                                                                                                                           | Our YouTube channel with video tutorials, talks and more.                                                                                                                                                                                                                                                                                    |
 | 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                                 |
 | 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                                        |
 | 👕 **[Swag]**                                                                                                                                                                                                             | Support us and our work with unique, custom-designed swag!                                                                                                                                                                                                                                                                                   |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)**                 |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a>    | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
@ -61,6 +62,7 @@ open-source software, released under the
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 [swag]: https://explosion.ai/merch
 ## 💬 Where to ask questions
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -271,8 +271,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
@ -308,8 +309,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
@ -542,14 +544,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@ -570,15 +573,17 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "predicted docs when training {component}.")
    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
             "but only callbacks with one or three parameters are supported")
    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/lang/fo/init.py
+++ b/spacy/lang/fo/init.py
@ -0,0 +1,18 @@
 from ...language import BaseDefaults, Language
 from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class FaroeseDefaults(BaseDefaults):
    # Prefix, infix and suffix rules are the shared ones imported from
    # ..punctuation; only the exception table comes from this package.
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 class Faroese(Language):
    # Language subclass registered under the code "fo".
    Defaults = FaroeseDefaults
    lang = "fo"
 __all__ = ["Faroese"]
--- a/spacy/lang/fo/tokenizer_exceptions.py
+++ b/spacy/lang/fo/tokenizer_exceptions.py
@ -0,0 +1,90 @@
 from ...symbols import ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 _exc = {}
 # Abbreviations whose periods are part of the token. Each one is registered
 # twice: exactly as listed and in its str.capitalize() form.
 for abbreviation in [
    "apr.",
    "aug.",
    "avgr.",
    "árg.",
    "ávís.",
    "beinl.",
    "blkv.",
    "blaðkv.",
    "blm.",
    "blaðm.",
    "bls.",
    "blstj.",
    "blaðstj.",
    "des.",
    "eint.",
    "febr.",
    "fyrrv.",
    "góðk.",
    "h.m.",
    "innt.",
    "jan.",
    "kl.",
    "m.a.",
    "mðr.",
    "mió.",
    "nr.",
    "nto.",
    "nov.",
    "nút.",
    "o.a.",
    "o.a.m.",
    "o.a.tíl.",
    "o.fl.",
    "ff.",
    "o.m.a.",
    "o.o.",
    "o.s.fr.",
    "o.tíl.",
    "o.ø.",
    "okt.",
    "omf.",
    "pst.",
    "ritstj.",
    "sbr.",
    "sms.",
    "smst.",
    "smb.",
    "sb.",
    "sbrt.",
    "sp.",
    "sept.",
    "spf.",
    "spsk.",
    "t.e.",
    "t.s.",
    "t.s.s.",
    "tlf.",
    "tel.",
    "tsk.",
    "t.o.v.",
    "t.d.",
    "uml.",
    "ums.",
    "uppl.",
    "upprfr.",
    "uppr.",
    "útg.",
    "útl.",
    "útr.",
    "vanl.",
    "v.",
    "v.h.",
    "v.ø.o.",
    "viðm.",
    "viðv.",
    "vm.",
    "v.m.",
 ]:
    for variant in (abbreviation, abbreviation.capitalize()):
        _exc[variant] = [{ORTH: variant}]
 # Combine the Faroese entries with the shared base exceptions.
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/lang/nn/init.py
+++ b/spacy/lang/nn/init.py
@ -0,0 +1,20 @@
 from ...language import BaseDefaults, Language
 from ..nb import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class NorwegianNynorskDefaults(BaseDefaults):
    # Punctuation rules and exceptions come from this package's own modules;
    # the syntax iterators are the ones imported from the sibling `nb` package.
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    syntax_iterators = SYNTAX_ITERATORS
 class NorwegianNynorsk(Language):
    # Language subclass registered under the code "nn".
    Defaults = NorwegianNynorskDefaults
    lang = "nn"
 __all__ = ["NorwegianNynorsk"]
--- a/spacy/lang/nn/examples.py
+++ b/spacy/lang/nn/examples.py
@ -0,0 +1,15 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.nn.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 sentences = [
    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
    "Det er ein meir enn i same periode i fjor.",
    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
 ]
--- a/spacy/lang/nn/punctuation.py
+++ b/spacy/lang/nn/punctuation.py
@ -0,0 +1,74 @@
 from ..char_classes import (
    ALPHA,
    ALPHA_LOWER,
    ALPHA_UPPER,
    CONCAT_QUOTES,
    CURRENCY,
    LIST_CURRENCY,
    LIST_ELLIPSES,
    LIST_ICONS,
    LIST_PUNCT,
    LIST_QUOTES,
    PUNCT,
    UNITS,
 )
 from ..punctuation import TOKENIZER_SUFFIXES
 # Filtered copies of the shared character classes used by the rules below.
 # Quote characters without the ASCII apostrophe.
 _quotes = CONCAT_QUOTES.replace("'", "")
 # Punctuation list without "#".
 _list_punct = [x for x in LIST_PUNCT if x != "#"]
 # Icon patterns without the degree sign: first drop a literal "°" entry, then
 # strip the escaped \u00B0 sequence from the remaining patterns.
 _list_icons = [x for x in LIST_ICONS if x != "°"]
 _list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
 # Quote list without the escaped-apostrophe pattern.
 _list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
 # Prefix rules.
 # NOTE(review): this uses the unfiltered LIST_QUOTES and LIST_ICONS rather than
 # the filtered _list_quotes / _list_icons built above -- confirm this is intended.
 _prefixes = (
    # The last pattern is a "+" that is not followed by a digit.
    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + _list_punct
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
 )
 # Infix rules: every regex below only matches between two characters from the
 # classes named in its lookbehind / lookahead.
 _infixes = (
    LIST_ELLIPSES
    + _list_icons
    + [
        # "." between a lowercase and an uppercase letter
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # ",", "!" or "?" between two letters
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        # ":", "<", ">", "=" or "/" between two letters
        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
        # "," between two letters (also matched by the ",!?" rule above)
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # a quote character or bracket between two letters
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        # "--" between two letters
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
    ]
 )
 # Suffix rules.
 # NOTE(review): starts from the unfiltered LIST_PUNCT (which still contains
 # "#"), unlike the prefixes -- confirm this is intended.
 _suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + _list_quotes
    + _list_icons
    + ["—", "–"]
    + [
        # "+" directly after a digit
        r"(?<=[0-9])\+",
        # "." after a degree sign followed by F, C or K (either case)
        r"(?<=°[FfCcKk])\.",
        # currency symbol directly after a digit
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        # unit directly after a digit
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        # "." after a lowercase letter, one of %²-+, punctuation or a quote
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
        ),
        # "." after two uppercase letters
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
    # apostrophe that is not preceded by s, x or z (either case)
    + [r"(?<=[^sSxXzZ])'"]
 )
 # Append the shared suffix rules imported from ..punctuation, leaving out the
 # "'s"-style entries and the escaped apostrophe.
 _suffixes += [
    suffix
    for suffix in TOKENIZER_SUFFIXES
    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
 ]
 # Public names of this module. The last assignment rebinds the imported
 # TOKENIZER_SUFFIXES, so it has to stay after the comprehension above.
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
--- a/spacy/lang/nn/tokenizer_exceptions.py
+++ b/spacy/lang/nn/tokenizer_exceptions.py
@ -0,0 +1,228 @@
 from ...symbols import NORM, ORTH
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 _exc = {}
 # Month abbreviations that carry the full month name as their NORM.
 # note: "jul." is in the simple list below without a NORM exception
 for abbr, month in [
    ("jan.", "januar"),
    ("feb.", "februar"),
    ("mar.", "mars"),
    ("apr.", "april"),
    ("jun.", "juni"),
    ("aug.", "august"),
    ("sep.", "september"),
    ("okt.", "oktober"),
    ("nov.", "november"),
    ("des.", "desember"),
 ]:
    _exc[abbr] = [{ORTH: abbr, NORM: month}]
 # Abbreviations that are kept as a single token without any NORM.
 # Entries that already received a NORM in the month loop above must not be
 # repeated here: the plain assignment at the bottom would overwrite them and
 # silently drop the NORM.
 for orth in [
    "Ap.",
    "Aq.",
    "Ca.",
    "Chr.",
    "Co.",
    "Dr.",
    "F.eks.",
    "Fr.p.",
    "Frp.",
    "Grl.",
    "Kr.",
    "Kr.F.",
    "Kr.F.s",
    "Mr.",
    "Mrs.",
    "Pb.",
    "Pr.",
    "Sp.",
    "St.",
    "a.m.",
    "ad.",
    "adm.dir.",
    "adr.",
    "b.c.",
    "bl.a.",
    "bla.",
    "bm.",
    "bnr.",
    "bto.",
    "c.c.",
    "ca.",
    "cand.mag.",
    "co.",
    "d.d.",
    "d.m.",
    "d.y.",
    "dept.",
    "dr.",
    "dr.med.",
    "dr.philos.",
    "dr.psychol.",
    "dss.",
    "dvs.",
    "e.Kr.",
    "e.l.",
    "eg.",
    "eig.",
    "ekskl.",
    "el.",
    "et.",
    "etc.",
    "etg.",
    "ev.",
    "evt.",
    "f.",
    "f.Kr.",
    "f.eks.",
    "f.o.m.",
    "fhv.",
    "fk.",
    "foreg.",
    "fork.",
    "fv.",
    "fvt.",
    "g.",
    "gl.",
    "gno.",
    "gnr.",
    "grl.",
    "gt.",
    "h.r.adv.",
    "hhv.",
    "hoh.",
    "hr.",
    "ifb.",
    "ifm.",
    "iht.",
    "inkl.",
    "istf.",
    "jf.",
    "jr.",
    "jul.",
    "juris.",
    "kfr.",
    "kgl.",
    "kgl.res.",
    "kl.",
    "komm.",
    "kr.",
    "kst.",
    "lat.",
    "lø.",
    "m.a.",
    "m.a.o.",
    "m.fl.",
    "m.m.",
    "m.v.",
    "ma.",
    "mag.art.",
    "md.",
    "mfl.",
    "mht.",
    "mill.",
    "min.",
    "mnd.",
    "moh.",
    "mrd.",
    "muh.",
    "mv.",
    "mva.",
    "n.å.",
    "ndf.",
    "nr.",
    "nto.",
    "nyno.",
    "o.a.",
    "o.l.",
    "obl.",
    "off.",
    "ofl.",
    "on.",
    "op.",
    "org.",
    "osv.",
    "ovf.",
    "p.",
    "p.a.",
    "p.g.a.",
    "p.m.",
    "p.t.",
    "pga.",
    "ph.d.",
    "pkt.",
    "pr.",
    "pst.",
    "pt.",
    "red.anm.",
    "ref.",
    "res.",
    "res.kap.",
    "resp.",
    "rv.",
    "s.",
    "s.d.",
    "s.k.",
    "s.u.",
    "s.å.",
    "sen.",
    # "sep." is intentionally absent: it is registered above with
    # NORM "september", which a plain entry here would discard.
    "siviling.",
    "sms.",
    "snr.",
    "spm.",
    "sr.",
    "sst.",
    "st.",
    "st.meld.",
    "st.prp.",
    "stip.",
    "stk.",
    "stud.",
    "sv.",
    "såk.",
    "sø.",
    "t.d.",
    "t.h.",
    "t.o.m.",
    "t.v.",
    "temp.",
    "ti.",
    "tils.",
    "tilsv.",
    "tl;dr",
    "tlf.",
    "to.",
    "ult.",
    "utg.",
    "v.",
    "vedk.",
    "vedr.",
    "vg.",
    "vgs.",
    "vha.",
    "vit.ass.",
    "vn.",
    "vol.",
    "vs.",
    "vsa.",
    "§§",
    "©NTB",
    "årg.",
    "årh.",
 ]:
    _exc[orth] = [{ORTH: orth}]
 # Dates: the numbers 1 to 31 followed by a period are kept as one token.
 for day in range(1, 32):
    ordinal = f"{day}."
    _exc[ordinal] = [{ORTH: ordinal}]
 # "i." is split into the word "i" and a separate "." token.
 _custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)
 # Combine the Nynorsk entries with the shared base exceptions.
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -1,5 +1,5 @@
 from functools import partial
-from typing import List, Optional, cast
+from typing import List, Optional, Tuple, cast
 from thinc.api import (
    Dropout,
@ -12,6 +12,7 @@ from thinc.api import (
    Relu,
    Softmax,
    SparseLinear,
    SparseLinear_v2,
    chain,
    clone,
    concatenate,
@ -25,9 +26,10 @@ from thinc.api import (
 )
 from thinc.layers.chain import init as init_chain
 from thinc.layers.resizable import resize_linear_weighted, resize_model
-from thinc.types import Floats2d
+from thinc.types import ArrayXd, Floats2d
 from ...attrs import ORTH
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@ -95,10 +97,48 @@ def build_bow_text_classifier(
    ngram_size: int,
    no_output_layer: bool,
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=SparseLinear(nO=nO),
    )
@registry.architectures("spacy.TextCatBOW.v3")
 def build_bow_text_classifier_v3(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    length: int = 262144,
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    if length < 1:
        raise ValueError(Errors.E1056.format(length=length))
    # Find k such that 2**(k-1) < length <= 2**k.
    length = 2 ** (length - 1).bit_length()
    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=SparseLinear_v2(nO=nO, length=length),
    )
 def _build_bow_text_classifier(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    """Shared builder behind the registered TextCatBOW architectures.

    exclusive_classes (bool): Whether or not categories are mutually exclusive.
    ngram_size (int): Maximum length of the n-grams in the BOW model.
    no_output_layer (bool): If True, no output activation layer is created.
    sparse_linear (Model): The sparse linear layer to use, constructed by the
        caller (`SparseLinear` for the older architecture, `SparseLinear_v2`
        with an explicit `length` for v3).
    nO (Optional[int]): Output dimension.
    """
    fill_defaults = {"b": 0, "W": 0}
    with Model.define_operators({">>": chain}):
        # `sparse_linear` is used exactly as passed in. It must not be
        # re-created here: doing so would throw away the caller's layer and
        # with it the `length` configured for TextCatBOW.v3.
        output_layer = None
        if not no_output_layer:
            fill_defaults["b"] = NEG_VALUE
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -36,8 +36,9 @@ maxout_pieces = 3
 depth = 2
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 """
@ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
 single_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 """
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@ -35,8 +35,9 @@ maxout_pieces = 3
 depth = 2
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 length = 262144
 ngram_size = 1
 no_output_layer = false
 """
@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
 multi_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -162,6 +162,11 @@ def fi_tokenizer():
    return get_lang_class("fi")().tokenizer
@pytest.fixture(scope="session")
 def fo_tokenizer():
    return get_lang_class("fo")().tokenizer
@pytest.fixture(scope="session")
 def fr_tokenizer():
    return get_lang_class("fr")().tokenizer
@ -317,6 +322,11 @@ def nl_tokenizer():
    return get_lang_class("nl")().tokenizer
@pytest.fixture(scope="session")
 def nn_tokenizer():
    return get_lang_class("nn")().tokenizer
@pytest.fixture(scope="session")
 def pl_tokenizer():
    return get_lang_class("pl")().tokenizer
--- a/spacy/tests/lang/fo/init.py
+++ b/spacy/tests/lang/fo/init.py
--- a/spacy/tests/lang/fo/test_tokenizer.py
+++ b/spacy/tests/lang/fo/test_tokenizer.py
@ -0,0 +1,26 @@
 import pytest
 # examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
 # fmt: off
 FO_TOKEN_EXCEPTION_TESTS = [
    (
        "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
        [
            "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
        ],
    ),
    (
        "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
        [
            "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
        ],
    ),
 ]
 # fmt: on
@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
 def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
    tokens = fo_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
--- a/spacy/tests/lang/nn/init.py
+++ b/spacy/tests/lang/nn/init.py
--- a/spacy/tests/lang/nn/test_tokenizer.py
+++ b/spacy/tests/lang/nn/test_tokenizer.py
@ -0,0 +1,38 @@
 import pytest
 # examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 # fmt: off
 NN_TOKEN_EXCEPTION_TESTS = [
    (
        "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
        [
            "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".",
        ],
    ),
    (
        "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
        [
            "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".",
        ],
    ),
    (
        "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
        [
            "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
        ],
    ),
    (
        "Brukssesongen er frå nov. til mai, med ein topp i mars.",
        [
            "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
        ],
    ),
 ]
 # fmt: on
@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
 def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
    tokens = nn_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -203,7 +203,7 @@ def test_pipe_class_component_model():
            "@architectures": "spacy.TextCatEnsemble.v2",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "linear_model": {
-                "@architectures": "spacy.TextCatBOW.v2",
+                "@architectures": "spacy.TextCatBOW.v3",
                "exclusive_classes": False,
                "ngram_size": 1,
                "no_output_layer": False,
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
@pytest.mark.parametrize(
    "name,textcat_config",
    [
-        # BOW
+        # BOW V1
        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
@ -451,11 +451,11 @@ def test_no_resize(name, textcat_config):
@pytest.mark.parametrize(
    "name,textcat_config",
    [
-        # BOW
+        # BOW V3
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
        # CNN
        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -480,11 +480,11 @@ def test_resize(name, textcat_config):
@pytest.mark.parametrize(
    "name,textcat_config",
    [
-        # BOW
+        # BOW v3
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
        # CNN
        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -693,9 +693,14 @@ def test_overfitting_IO_multi():
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
        # BOW V3
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
        # ENSEMBLE V2
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
+        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
+        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
        # CNN V2
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -238,7 +238,7 @@ def test_project_push_pull(project_dir):
 def test_find_function_valid():
    # example of architecture in main code base
-    function = "spacy.TextCatBOW.v2"
+    function = "spacy.TextCatBOW.v3"
    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
    assert f"Found registered function '{function}'" in result.stdout
    assert "textcat.py" in result.stdout
@ -257,7 +257,7 @@ def test_find_function_valid():
 def test_find_function_invalid():
    # invalid registry
-    function = "spacy.TextCatBOW.v2"
+    function = "spacy.TextCatBOW.v3"
    registry = "foobar"
    result = CliRunner().invoke(
        app, ["find-function", function, "--registry", registry]
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -376,8 +376,9 @@ def test_util_dot_section():
    factory = "textcat"
    [components.textcat.model]
-    @architectures = "spacy.TextCatBOW.v2"
+    @architectures = "spacy.TextCatBOW.v3"
    exclusive_classes = true
    length = 262144
    ngram_size = 1
    no_output_layer = false
    """
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@ -79,7 +79,7 @@ subword features, and a
 consisting of a CNN and a layer-normalized maxout activation function.
 | Name                 | Description                                                                                                                                                                                                                                                                 |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        |
 | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              |
 | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          |
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
 > nO = null
 >
 > [model.linear_model]
-> @architectures = "spacy.TextCatBOW.v2"
+> @architectures = "spacy.TextCatBOW.v3"
 > exclusive_classes = true
 > length = 262144
 > ngram_size = 1
 > no_output_layer = false
 >
@ -1057,14 +1058,15 @@ after training.
 </Accordion>
-### spacy.TextCatBOW.v2 {id="TextCatBOW"}
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TextCatBOW.v2"
+> @architectures = "spacy.TextCatBOW.v3"
 > exclusive_classes = false
 > length = 262144
 > ngram_size = 1
 > no_output_layer = false
 > nO = null
@ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
 | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
 | `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              |
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
+<Accordion title="Previous versions of spacy.TextCatBOW" spaced>
-[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
-not yet resizable. Since v2, new labels can be added to this component, even
+  new labels can be added to this component, even after training.
-after training.
+- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
  layer that only used a small number of the allocated parameters.
 - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
 </Accordion>
--- a/website/docs/api/curatedtransformer.mdx
+++ b/website/docs/api/curatedtransformer.mdx
@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
 to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
 attribute.
 > #### Example
 >
 > ```python
 > # Get the last hidden layer output for "is" (token index 1)
 > doc = nlp("This is a text.")
 > tensors = doc._.trf_data.last_hidden_layer_state[1]
 > ```
 | Name              | Description                                                                                                                                                                        |
 | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `all_outputs`     | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~  |
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
 Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
 that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
-yet support that.
+yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
 erroneous sparse linear layer that only used a small number of the allocated
 parameters.
 > #### Example Config
 >
@ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short.
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 ### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"}
 Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
 linear layer that only used a small number of the allocated parameters.
 > #### Example Config
 >
 > ```ini
 > [model]
 > @architectures = "spacy.TextCatBOW.v2"
 > exclusive_classes = false
 > ngram_size = 1
 > no_output_layer = false
 > nO = null
 > ```
 An n-gram "bag-of-words" model. This architecture should run much faster than
 the others, but may not be as accurate, especially if texts are short.
 | Name                | Description                                                                                                                                                                                    |
 | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
 | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 Identical to
--- a/website/docs/api/transformer.mdx
+++ b/website/docs/api/transformer.mdx
@ -397,6 +397,17 @@ are wrapped into the
 by this class. Instances of this class are typically assigned to the
 [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
 > #### Example
 >
 > ```python
 > # Get the last hidden layer output for "is" (token index 1)
 > doc = nlp("This is a text.")
 > indices = doc._.trf_data.align[1].data.flatten()
 > last_hidden_state = doc._.trf_data.model_output.last_hidden_state
 > dim = last_hidden_state.shape[-1]
 > tensors = last_hidden_state.reshape(-1, dim)[indices]
 > ```
 | Name           | Description                                                                                                                                                                                                                                                                                                                          |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       |
--- a/website/docs/models/index.mdx
+++ b/website/docs/models/index.mdx
@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:
 #### CNN/CPU pipelines with floret vectors
-The Finnish, Korean and Swedish `md` and `lg` pipelines use
+The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
-[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're
+pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
-running a trained pipeline on texts and working with [`Doc`](/api/doc) objects,
+If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
-you shouldn't notice any difference with floret vectors. With floret vectors no
+objects, you shouldn't notice any difference with floret vectors. With floret
-tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will
+vectors no tokens are out-of-vocabulary, so
-return `False` for all tokens.
+[`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.
 If you access vectors directly for similarity comparisons, there are a few
 differences because floret vectors don't include a fixed word list like the
@ -132,10 +132,20 @@ vector keys for default vectors.
 ### Transformer pipeline design {id="design-trf"}
-In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
+In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
-all listen to the `transformer` component. The `attribute_ruler` and
+present) all listen to the `transformer` component. The `attribute_ruler` and
 `lemmatizer` have the same configuration as in the CNN models.
 For spaCy v3.0-v3.6, `trf` pipelines use
 [`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
 transformer output in `doc._.trf_data` is a
 [`TransformerData`](/api/transformer#transformerdata) object.
 For spaCy v3.7+, `trf` pipelines use
 [`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
 and `doc._.trf_data` is a
 [`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
 ### Modifying the default pipeline {id="design-modify"}
 For faster processing, you may only want to run a subset of the components in a
--- a/website/docs/usage/layers-architectures.mdx
+++ b/website/docs/usage/layers-architectures.mdx
@ -153,8 +153,9 @@ maxout_pieces = 3
 depth = 2
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 ```
@ -170,8 +171,9 @@ factory = "textcat"
 labels = []
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
 nO = null
--- a/website/docs/usage/processing-pipelines.mdx
+++ b/website/docs/usage/processing-pipelines.mdx
@ -1328,8 +1328,9 @@ labels = []
 # This function is created and then passed to the "textcat" component as
 # the argument "model"
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 length = 262144
 ngram_size = 1
 no_output_layer = false
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -103,6 +103,10 @@
            "has_examples": true,
            "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
        },
        {
            "code": "fo",
            "name": "Faroese"
        },
        {
            "code": "fr",
            "name": "French",
@ -290,6 +294,12 @@
            "example": "Dit is een zin.",
            "has_examples": true
        },
        {
            "code": "nn",
            "name": "Norwegian Nynorsk",
            "example": "Det er ein meir enn i same periode i fjor.",
            "has_examples": true
        },
        {
            "code": "pl",
            "name": "Polish",
--- a/website/meta/site.json
+++ b/website/meta/site.json
@ -66,6 +66,10 @@
                {
                    "text": "Stack Overflow",
                    "url": "http://stackoverflow.com/questions/tagged/spacy"
                },
                {
                    "text": "Merchandise",
                    "url": "https://explosion.ai/merch"
                }
            ]
        },
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -4500,6 +4500,23 @@
                "website": "https://nlp.unibuc.ro/people/snisioi.html"
            },
            "category": ["pipeline", "training", "models"]
        },
        {
            "id": "redfield-spacy-nodes",
            "title": "Redfield NLP Nodes for KNIME",
            "slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
            "description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
            "github": "Redfield-AB/Spacy-Nodes",
            "url": "https://redfield.ai/spacy-redfield/",
            "thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
            "image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
            "author": "Redfield AB",
            "author_links": {
                "twitter": "Redfield_AB",
                "github": "Redfield-AB",
                "website": "https://redfield.ai"
            },
            "category": ["standalone"]
        }
    ],