Merge remote-tracking branch 'origin/fix/types' into fix/types

This commit is contained in:
svlandeg 2023-12-07 13:43:59 +01:00
commit a2c9eeca0b
32 changed files with 734 additions and 58 deletions

1
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1 @@
custom: https://explosion.ai/merch

View File

@ -46,6 +46,7 @@ open-source software, released under the
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. | | 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** | | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** | | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
@ -61,6 +62,7 @@ open-source software, released under the
[project templates]: https://github.com/explosion/projects [project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog [changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
[swag]: https://explosion.ai/merch
## 💬 Where to ask questions ## 💬 Where to ask questions

View File

@ -271,8 +271,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -308,8 +309,9 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
[components.textcat_multilabel.model.linear_model] [components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -542,14 +544,15 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{% else -%} {% else -%}
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
@ -570,15 +573,17 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
[components.textcat_multilabel.model.linear_model] [components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{% else -%} {% else -%}
[components.textcat_multilabel.model] [components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
{%- endif %} {%- endif %}

View File

@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
"predicted docs when training {component}.") "predicted docs when training {component}.")
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
"but only callbacks with one or three parameters are supported") "but only callbacks with one or three parameters are supported")
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

18
spacy/lang/fo/__init__.py Normal file
View File

@ -0,0 +1,18 @@
from ...language import BaseDefaults, Language
from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class FaroeseDefaults(BaseDefaults):
    """Language defaults for Faroese.

    Combines the Faroese tokenizer exceptions with the prefix, suffix and
    infix rules imported from the shared ``punctuation`` module of the parent
    package.
    """

    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class Faroese(Language):
    """Language subclass for Faroese, identified by the code ``fo``."""

    Defaults = FaroeseDefaults
    lang = "fo"
__all__ = ["Faroese"]

View File

@ -0,0 +1,90 @@
from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Maps an exact string to the token sequence it should be analysed as.
_exc = {}

# Each entry is kept as one token, both as written here and with its first
# letter capitalized.
for orth in [
    "apr.",
    "aug.",
    "avgr.",
    "árg.",
    "ávís.",
    "beinl.",
    "blkv.",
    "blaðkv.",
    "blm.",
    "blaðm.",
    "bls.",
    "blstj.",
    "blaðstj.",
    "des.",
    "eint.",
    "febr.",
    "fyrrv.",
    "góðk.",
    "h.m.",
    "innt.",
    "jan.",
    "kl.",
    "m.a.",
    "mðr.",
    "mió.",
    "nr.",
    "nto.",
    "nov.",
    "nút.",
    "o.a.",
    "o.a.m.",
    "o.a.tíl.",
    "o.fl.",
    "ff.",
    "o.m.a.",
    "o.o.",
    "o.s.fr.",
    "o.tíl.",
    "o.ø.",
    "okt.",
    "omf.",
    "pst.",
    "ritstj.",
    "sbr.",
    "sms.",
    "smst.",
    "smb.",
    "sb.",
    "sbrt.",
    "sp.",
    "sept.",
    "spf.",
    "spsk.",
    "t.e.",
    "t.s.",
    "t.s.s.",
    "tlf.",
    "tel.",
    "tsk.",
    "t.o.v.",
    "t.d.",
    "uml.",
    "ums.",
    "uppl.",
    "upprfr.",
    "uppr.",
    "útg.",
    "útl.",
    "útr.",
    "vanl.",
    "v.",
    "v.h.",
    "v.ø.o.",
    "viðm.",
    "viðv.",
    "vm.",
    "v.m.",
]:
    capitalized = orth.capitalize()
    _exc.update({orth: [{ORTH: orth}], capitalized: [{ORTH: capitalized}]})

# Layer the Faroese entries on top of the shared base exceptions.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

20
spacy/lang/nn/__init__.py Normal file
View File

@ -0,0 +1,20 @@
from ...language import BaseDefaults, Language
from ..nb import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class NorwegianNynorskDefaults(BaseDefaults):
    """Language defaults for Norwegian Nynorsk.

    Uses the punctuation rules and tokenizer exceptions defined in this
    package, and reuses the syntax iterators exported by the sibling ``nb``
    package.
    """

    syntax_iterators = SYNTAX_ITERATORS
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class NorwegianNynorsk(Language):
    """Language subclass for Norwegian Nynorsk, identified by the code ``nn``."""

    Defaults = NorwegianNynorskDefaults
    lang = "nn"
__all__ = ["NorwegianNynorsk"]

15
spacy/lang/nn/examples.py Normal file
View File

@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nn.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
# Four plain-text sample sentences; the module docstring shows how to pass them to nlp.pipe.
sentences = [
"Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
"Det er ein meir enn i same periode i fjor.",
"Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
"Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
]

View File

@ -0,0 +1,74 @@
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
UNITS,
)
from ..punctuation import TOKENIZER_SUFFIXES
# Variants of the shared character lists with individual symbols removed:
# the plain apostrophe from the quotes, "#" from the punctuation and the
# degree sign from the icons.
_quotes = CONCAT_QUOTES.replace("'", "")
_list_punct = [punct for punct in LIST_PUNCT if punct != "#"]
_list_icons = [
    icon.replace("\\u00B0", "") for icon in LIST_ICONS if icon != "°"
]
_list_quotes = [quote for quote in LIST_QUOTES if quote != "\\'"]
# Prefix patterns: the section, percent and equals signs, the em dash and the
# en dash, a "+" that is not followed by a digit, and then the punctuation
# (without "#"), ellipsis, quote, currency and icon lists.
_prefixes = (
    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
    + _list_punct
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)
# Infix patterns: ellipses and icons anywhere, plus a set of separators that
# only split when they sit between two alphabetic characters (or, for the
# first rule, between a lowercase and an uppercase letter).
_infixes = (
    LIST_ELLIPSES
    + _list_icons
    + [
        rf"(?<=[{ALPHA_LOWER}])\.(?=[{ALPHA_UPPER}])",
        rf"(?<=[{ALPHA}])[,!?](?=[{ALPHA}])",
        rf"(?<=[{ALPHA}])[:<>=/](?=[{ALPHA}])",
        rf"(?<=[{ALPHA}]),(?=[{ALPHA}])",
        rf"(?<=[{ALPHA}])([{_quotes}\)\]\(\[])(?=[{ALPHA}])",
        rf"(?<=[{ALPHA}])--(?=[{ALPHA}])",
    ]
)
# Suffix patterns: punctuation, ellipses, quotes (without the escaped
# apostrophe) and icons (without the degree sign), the em dash and the en
# dash, followed by context-dependent rules for "+", ".", currency symbols,
# units and a trailing apostrophe.
_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + _list_quotes
    + _list_icons
    + ["—", "–"]
    + [
        # "+" directly after a digit.
        r"(?<=[0-9])\+",
        # "." after a temperature unit such as "°C".
        r"(?<=°[FfCcKk])\.",
        # Currency symbol or unit directly after a digit.
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        # "." after a lowercase letter, one of "%²-+", punctuation or a quote.
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
        ),
        # "." after two uppercase letters.
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
    # Apostrophe, unless it follows s, x or z (either case).
    + [r"(?<=[^sSxXzZ])'"]
)
# Append the shared suffix rules, leaving out the possessive-"s" and
# apostrophe patterns listed here.
_excluded_suffixes = ["'s", "'S", "s", "S", r"\'"]
_suffixes.extend(
    shared for shared in TOKENIZER_SUFFIXES if shared not in _excluded_suffixes
)

# Names exported to the language defaults.
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -0,0 +1,228 @@
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Maps an exact string to the token sequence it should be analysed as.
_exc = {}

# Month abbreviations: kept as one token whose NORM is the full month name.
for orth, norm in [
    ("jan.", "januar"),
    ("feb.", "februar"),
    ("mar.", "mars"),
    ("apr.", "april"),
    ("jun.", "juni"),
    # note: "jul." is in the simple list below without a NORM exception
    ("aug.", "august"),
    ("sep.", "september"),
    ("okt.", "oktober"),
    ("nov.", "november"),
    ("des.", "desember"),
]:
    exc_data = {ORTH: orth, NORM: norm}
    _exc[orth] = [exc_data]
# Strings that are kept as a single token without any NORM.
for orth in [
    "Ap.",
    "Aq.",
    "Ca.",
    "Chr.",
    "Co.",
    "Dr.",
    "F.eks.",
    "Fr.p.",
    "Frp.",
    "Grl.",
    "Kr.",
    "Kr.F.",
    "Kr.F.s",
    "Mr.",
    "Mrs.",
    "Pb.",
    "Pr.",
    "Sp.",
    "St.",
    "a.m.",
    "ad.",
    "adm.dir.",
    "adr.",
    "b.c.",
    "bl.a.",
    "bla.",
    "bm.",
    "bnr.",
    "bto.",
    "c.c.",
    "ca.",
    "cand.mag.",
    "co.",
    "d.d.",
    "d.m.",
    "d.y.",
    "dept.",
    "dr.",
    "dr.med.",
    "dr.philos.",
    "dr.psychol.",
    "dss.",
    "dvs.",
    "e.Kr.",
    "e.l.",
    "eg.",
    "eig.",
    "ekskl.",
    "el.",
    "et.",
    "etc.",
    "etg.",
    "ev.",
    "evt.",
    "f.",
    "f.Kr.",
    "f.eks.",
    "f.o.m.",
    "fhv.",
    "fk.",
    "foreg.",
    "fork.",
    "fv.",
    "fvt.",
    "g.",
    "gl.",
    "gno.",
    "gnr.",
    "grl.",
    "gt.",
    "h.r.adv.",
    "hhv.",
    "hoh.",
    "hr.",
    "ifb.",
    "ifm.",
    "iht.",
    "inkl.",
    "istf.",
    "jf.",
    "jr.",
    "jul.",
    "juris.",
    "kfr.",
    "kgl.",
    "kgl.res.",
    "kl.",
    "komm.",
    "kr.",
    "kst.",
    "lat.",
    "lø.",
    "m.a.",
    "m.a.o.",
    "m.fl.",
    "m.m.",
    "m.v.",
    "ma.",
    "mag.art.",
    "md.",
    "mfl.",
    "mht.",
    "mill.",
    "min.",
    "mnd.",
    "moh.",
    "mrd.",
    "muh.",
    "mv.",
    "mva.",
    "n.å.",
    "ndf.",
    "nr.",
    "nto.",
    "nyno.",
    "o.a.",
    "o.l.",
    "obl.",
    "off.",
    "ofl.",
    "on.",
    "op.",
    "org.",
    "osv.",
    "ovf.",
    "p.",
    "p.a.",
    "p.g.a.",
    "p.m.",
    "p.t.",
    "pga.",
    "ph.d.",
    "pkt.",
    "pr.",
    "pst.",
    "pt.",
    "red.anm.",
    "ref.",
    "res.",
    "res.kap.",
    "resp.",
    "rv.",
    "s.",
    "s.d.",
    "s.k.",
    "s.u.",
    "s.å.",
    "sen.",
    "sep.",
    "siviling.",
    "sms.",
    "snr.",
    "spm.",
    "sr.",
    "sst.",
    "st.",
    "st.meld.",
    "st.prp.",
    "stip.",
    "stk.",
    "stud.",
    "sv.",
    "såk.",
    "sø.",
    "t.d.",
    "t.h.",
    "t.o.m.",
    "t.v.",
    "temp.",
    "ti.",
    "tils.",
    "tilsv.",
    "tl;dr",
    "tlf.",
    "to.",
    "ult.",
    "utg.",
    "v.",
    "vedk.",
    "vedr.",
    "vg.",
    "vgs.",
    "vha.",
    "vit.ass.",
    "vn.",
    "vol.",
    "vs.",
    "vsa.",
    "§§",
    "©NTB",
    "årg.",
    "årh.",
]:
    # Only add the plain entry when the string has no exception yet: "sep."
    # is also listed here, and a plain assignment would replace an entry
    # registered earlier with a NORM by one that has none.
    _exc.setdefault(orth, [{ORTH: orth}])
# Dates: keep "1." through "31." as single tokens.
period = "."
for h in range(1, 32):
    ordinal = f"{h}{period}"
    _exc[ordinal] = [{ORTH: ordinal}]

# "i." is split into the word "i" and a separate period.
_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)

# Layer the Nynorsk entries on top of the shared base exceptions.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,5 +1,5 @@
from functools import partial from functools import partial
from typing import List, Optional, cast from typing import List, Optional, Tuple, cast
from thinc.api import ( from thinc.api import (
Dropout, Dropout,
@ -12,6 +12,7 @@ from thinc.api import (
Relu, Relu,
Softmax, Softmax,
SparseLinear, SparseLinear,
SparseLinear_v2,
chain, chain,
clone, clone,
concatenate, concatenate,
@ -25,9 +26,10 @@ from thinc.api import (
) )
from thinc.layers.chain import init as init_chain from thinc.layers.chain import init as init_chain
from thinc.layers.resizable import resize_linear_weighted, resize_model from thinc.layers.resizable import resize_linear_weighted, resize_model
from thinc.types import Floats2d from thinc.types import ArrayXd, Floats2d
from ...attrs import ORTH from ...attrs import ORTH
from ...errors import Errors
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
@ -95,10 +97,48 @@ def build_bow_text_classifier(
ngram_size: int, ngram_size: int,
no_output_layer: bool, no_output_layer: bool,
nO: Optional[int] = None, nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
return _build_bow_text_classifier(
exclusive_classes=exclusive_classes,
ngram_size=ngram_size,
no_output_layer=no_output_layer,
nO=nO,
sparse_linear=SparseLinear(nO=nO),
)
@registry.architectures("spacy.TextCatBOW.v3")
def build_bow_text_classifier_v3(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    length: int = 262144,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build the ``spacy.TextCatBOW.v3`` bag-of-words text classifier.

    exclusive_classes (bool): Forwarded unchanged to the shared builder.
    ngram_size (int): Forwarded unchanged to the shared builder.
    no_output_layer (bool): Forwarded unchanged to the shared builder.
    length (int): Requested size for the ``SparseLinear_v2`` layer. Values
        that are not a power of two are rounded up to the next one.
    nO (Optional[int]): Output dimension, passed both to the sparse layer and
        to the shared builder.
    RETURNS (Model[List[Doc], Floats2d]): The model produced by
        ``_build_bow_text_classifier``.
    RAISES (ValueError): E1056 if ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError(Errors.E1056.format(length=length))

    # Smallest exponent k with length <= 2**k; for an exact power of two this
    # is that power itself, so such lengths are left unchanged.
    exponent = (length - 1).bit_length()
    rounded_length = 1 << exponent

    sparse_layer = SparseLinear_v2(nO=nO, length=rounded_length)
    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=sparse_layer,
    )
def _build_bow_text_classifier(
exclusive_classes: bool,
ngram_size: int,
no_output_layer: bool,
sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
fill_defaults = {"b": 0, "W": 0} fill_defaults = {"b": 0, "W": 0}
with Model.define_operators({">>": chain}): with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO=nO)
output_layer = None output_layer = None
if not no_output_layer: if not no_output_layer:
fill_defaults["b"] = NEG_VALUE fill_defaults["b"] = NEG_VALUE

View File

@ -36,8 +36,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[model.linear_model] [model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """
@ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
single_label_bow_config = """ single_label_bow_config = """
[model] [model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """

View File

@ -35,8 +35,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[model.linear_model] [model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """
@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
multi_label_bow_config = """ multi_label_bow_config = """
[model] [model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false exclusive_classes = false
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false

View File

@ -162,6 +162,11 @@ def fi_tokenizer():
return get_lang_class("fi")().tokenizer return get_lang_class("fi")().tokenizer
@pytest.fixture(scope="session")
def fo_tokenizer():
    """Session-scoped tokenizer taken from a newly constructed ``fo`` language object."""
    language_cls = get_lang_class("fo")
    return language_cls().tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def fr_tokenizer(): def fr_tokenizer():
return get_lang_class("fr")().tokenizer return get_lang_class("fr")().tokenizer
@ -317,6 +322,11 @@ def nl_tokenizer():
return get_lang_class("nl")().tokenizer return get_lang_class("nl")().tokenizer
@pytest.fixture(scope="session")
def nn_tokenizer():
    """Session-scoped tokenizer taken from a newly constructed ``nn`` language object."""
    language_cls = get_lang_class("nn")
    return language_cls().tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def pl_tokenizer(): def pl_tokenizer():
return get_lang_class("pl")().tokenizer return get_lang_class("pl")().tokenizer

View File

View File

@ -0,0 +1,26 @@
import pytest
# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
# fmt: off
# Pairs of (input text, expected non-space token texts). Joining the expected
# tokens must give back the input text with all whitespace removed.
FO_TOKEN_EXCEPTION_TESTS = [
    (
        "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
        [
            "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
        ],
    ),
    (
        "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
        [
            "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
        ],
    ),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
    """Tokenize `text` and compare the non-space token texts with `expected_tokens`."""
    doc = fo_tokenizer(text)
    token_list = []
    for token in doc:
        # Whitespace tokens are not part of the expected output.
        if token.is_space:
            continue
        token_list.append(token.text)
    assert expected_tokens == token_list

View File

View File

@ -0,0 +1,38 @@
import pytest
# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
# fmt: off
# Pairs of (input text, expected non-space token texts). Joining the expected
# tokens must give back the input text with all whitespace removed.
NN_TOKEN_EXCEPTION_TESTS = [
    (
        "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
        [
            "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".",
        ],
    ),
    (
        "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
        [
            "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".",
        ],
    ),
    (
        "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
        [
            "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
        ],
    ),
    (
        "Brukssesongen er frå nov. til mai, med ein topp i mars.",
        [
            "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
        ],
    ),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
    """Tokenize `text` and compare the non-space token texts with `expected_tokens`."""
    doc = nn_tokenizer(text)
    token_list = []
    for token in doc:
        # Whitespace tokens are not part of the expected output.
        if token.is_space:
            continue
        token_list.append(token.text)
    assert expected_tokens == token_list

View File

@ -203,7 +203,7 @@ def test_pipe_class_component_model():
"@architectures": "spacy.TextCatEnsemble.v2", "@architectures": "spacy.TextCatEnsemble.v2",
"tok2vec": DEFAULT_TOK2VEC_MODEL, "tok2vec": DEFAULT_TOK2VEC_MODEL,
"linear_model": { "linear_model": {
"@architectures": "spacy.TextCatBOW.v2", "@architectures": "spacy.TextCatBOW.v3",
"exclusive_classes": False, "exclusive_classes": False,
"ngram_size": 1, "ngram_size": 1,
"no_output_layer": False, "no_output_layer": False,

View File

@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V1
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
@ -451,11 +451,11 @@ def test_no_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -480,11 +480,11 @@ def test_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW v3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
@ -693,9 +693,14 @@ def test_overfitting_IO_multi():
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# BOW V3
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# ENSEMBLE V2 # ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# CNN V2 # CNN V2
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),

View File

@ -238,7 +238,7 @@ def test_project_push_pull(project_dir):
def test_find_function_valid(): def test_find_function_valid():
# example of architecture in main code base # example of architecture in main code base
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
assert f"Found registered function '{function}'" in result.stdout assert f"Found registered function '{function}'" in result.stdout
assert "textcat.py" in result.stdout assert "textcat.py" in result.stdout
@ -257,7 +257,7 @@ def test_find_function_valid():
def test_find_function_invalid(): def test_find_function_invalid():
# invalid registry # invalid registry
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
registry = "foobar" registry = "foobar"
result = CliRunner().invoke( result = CliRunner().invoke(
app, ["find-function", function, "--registry", registry] app, ["find-function", function, "--registry", registry]

View File

@ -376,8 +376,9 @@ def test_util_dot_section():
factory = "textcat" factory = "textcat"
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """

View File

@ -79,7 +79,7 @@ subword features, and a
consisting of a CNN and a layer-normalized maxout activation function. consisting of a CNN and a layer-normalized maxout activation function.
| Name | Description | | Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ | | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
> nO = null > nO = null
> >
> [model.linear_model] > [model.linear_model]
> @architectures = "spacy.TextCatBOW.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = true > exclusive_classes = true
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> >
@ -1057,14 +1058,15 @@ after training.
</Accordion> </Accordion>
### spacy.TextCatBOW.v2 {id="TextCatBOW"} ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
> #### Example Config > #### Example Config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.TextCatBOW.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = false > exclusive_classes = false
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> nO = null > nO = null
@ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
<Accordion title="spacy.TextCatBOW.v1 definition" spaced> <Accordion title="Previous versions of spacy.TextCatBOW" spaced>
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
not yet resizable. Since v2, new labels can be added to this component, even new labels can be added to this component, even after training.
after training. - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
layer that only used a small number of the allocated parameters.
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
</Accordion> </Accordion>

View File

@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
attribute. attribute.
> #### Example
>
> ```python
> # Get the last hidden layer output for "is" (token index 1)
> doc = nlp("This is a text.")
> tensors = doc._.trf_data.last_hidden_layer_state[1]
> ```
| Name | Description | | Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `all_outputs` | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ | | `all_outputs` | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |

View File

@ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
yet support that. yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
erroneous sparse linear layer that only used a small number of the allocated
parameters.
> #### Example Config > #### Example Config
> >
@ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"}
Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
linear layer that only used a small number of the allocated parameters.
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatBOW.v2"
> exclusive_classes = false
> ngram_size = 1
> no_output_layer = false
> nO = null
> ```
An n-gram "bag-of-words" model. This architecture should run much faster than
the others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
Identical to Identical to

View File

@ -397,6 +397,17 @@ are wrapped into the
by this class. Instances of this class are typically assigned to the by this class. Instances of this class are typically assigned to the
[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute. [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
> #### Example
>
> ```python
> # Get the last hidden layer output for "is" (token index 1)
> doc = nlp("This is a text.")
> indices = doc._.trf_data.align[1].data.flatten()
> last_hidden_state = doc._.trf_data.model_output.last_hidden_state
> dim = last_hidden_state.shape[-1]
> tensors = last_hidden_state.reshape(-1, dim)[indices]
> ```
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |

View File

@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:
#### CNN/CPU pipelines with floret vectors #### CNN/CPU pipelines with floret vectors
The Finnish, Korean and Swedish `md` and `lg` pipelines use The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
running a trained pipeline on texts and working with [`Doc`](/api/doc) objects, If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
you shouldn't notice any difference with floret vectors. With floret vectors no objects, you shouldn't notice any difference with floret vectors. With floret
tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will vectors no tokens are out-of-vocabulary, so
return `False` for all tokens. [`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.
If you access vectors directly for similarity comparisons, there are a few If you access vectors directly for similarity comparisons, there are a few
differences because floret vectors don't include a fixed word list like the differences because floret vectors don't include a fixed word list like the
@ -132,10 +132,20 @@ vector keys for default vectors.
### Transformer pipeline design {id="design-trf"} ### Transformer pipeline design {id="design-trf"}
In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
all listen to the `transformer` component. The `attribute_ruler` and present) all listen to the `transformer` component. The `attribute_ruler` and
`lemmatizer` have the same configuration as in the CNN models. `lemmatizer` have the same configuration as in the CNN models.
For spaCy v3.0-v3.6, `trf` pipelines use
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
transformer output in `doc._.trf_data` is a
[`TransformerData`](/api/transformer#transformerdata) object.
For spaCy v3.7+, `trf` pipelines use
[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
and `doc._.trf_data` is a
[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
### Modifying the default pipeline {id="design-modify"} ### Modifying the default pipeline {id="design-modify"}
For faster processing, you may only want to run a subset of the components in a For faster processing, you may only want to run a subset of the components in a

View File

@ -153,8 +153,9 @@ maxout_pieces = 3
depth = 2 depth = 2
[components.textcat.model.linear_model] [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
``` ```
@ -170,8 +171,9 @@ factory = "textcat"
labels = [] labels = []
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
nO = null nO = null

View File

@ -1328,8 +1328,9 @@ labels = []
# This function is created and then passed to the "textcat" component as # This function is created and then passed to the "textcat" component as
# the argument "model" # the argument "model"
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false

View File

@ -103,6 +103,10 @@
"has_examples": true, "has_examples": true,
"models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"] "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
}, },
{
"code": "fo",
"name": "Faroese"
},
{ {
"code": "fr", "code": "fr",
"name": "French", "name": "French",
@ -290,6 +294,12 @@
"example": "Dit is een zin.", "example": "Dit is een zin.",
"has_examples": true "has_examples": true
}, },
{
"code": "nn",
"name": "Norwegian Nynorsk",
"example": "Det er ein meir enn i same periode i fjor.",
"has_examples": true
},
{ {
"code": "pl", "code": "pl",
"name": "Polish", "name": "Polish",

View File

@ -66,6 +66,10 @@
{ {
"text": "Stack Overflow", "text": "Stack Overflow",
"url": "http://stackoverflow.com/questions/tagged/spacy" "url": "http://stackoverflow.com/questions/tagged/spacy"
},
{
"text": "Merchandise",
"url": "https://explosion.ai/merch"
} }
] ]
}, },

View File

@ -4500,6 +4500,23 @@
"website": "https://nlp.unibuc.ro/people/snisioi.html" "website": "https://nlp.unibuc.ro/people/snisioi.html"
}, },
"category": ["pipeline", "training", "models"] "category": ["pipeline", "training", "models"]
},
{
"id": "redfield-spacy-nodes",
"title": "Redfield NLP Nodes for KNIME",
"slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
"description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
"github": "Redfield-AB/Spacy-Nodes",
"url": "https://redfield.ai/spacy-redfield/",
"thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
"image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
"author": "Redfield AB",
"author_links": {
"twitter": "Redfield_AB",
"github": "Redfield-AB",
"website": "https://redfield.ai"
},
"category": ["standalone"]
} }
], ],