mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-30 02:33:07 +03:00
Merge remote-tracking branch 'origin/fix/types' into fix/types
This commit is contained in:
commit
a2c9eeca0b
1
.github/FUNDING.yml
vendored
Normal file
1
.github/FUNDING.yml
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
custom: https://explosion.ai/merch
|
|
@ -46,6 +46,7 @@ open-source software, released under the
|
|||
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
||||
| 🛠 **[Changelog]** | Changes and version history. |
|
||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||
| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
|
||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Analysis"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||
|
||||
|
@ -61,6 +62,7 @@ open-source software, released under the
|
|||
[project templates]: https://github.com/explosion/projects
|
||||
[changelog]: https://spacy.io/usage#changelog
|
||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||
[swag]: https://explosion.ai/merch
|
||||
|
||||
## 💬 Where to ask questions
|
||||
|
||||
|
|
|
@ -271,8 +271,9 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
|
@ -308,8 +309,9 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
|
@ -542,14 +544,15 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
@ -570,15 +573,17 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
|
|
|
@ -983,6 +983,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"predicted docs when training {component}.")
|
||||
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
|
||||
"but only callbacks with one or three parameters are supported")
|
||||
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
18
spacy/lang/fo/__init__.py
Normal file
18
spacy/lang/fo/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class FaroeseDefaults(BaseDefaults):
    """Tokenizer defaults for Faroese.

    Combines the shared prefix/infix/suffix punctuation rules with the
    Faroese-specific tokenizer exceptions.
    """

    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Faroese(Language):
    """Language subclass for Faroese, identified by the code "fo"."""

    Defaults = FaroeseDefaults
    lang = "fo"
|
||||
|
||||
|
||||
__all__ = ["Faroese"]
|
90
spacy/lang/fo/tokenizer_exceptions.py
Normal file
90
spacy/lang/fo/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}

# Each abbreviation is registered twice, in this order: exactly as listed and
# then in its str.capitalize() form, so both spellings stay a single token.
for _abbr in (
    "apr.", "aug.", "avgr.", "árg.", "ávís.", "beinl.", "blkv.", "blaðkv.",
    "blm.", "blaðm.", "bls.", "blstj.", "blaðstj.", "des.", "eint.", "febr.",
    "fyrrv.", "góðk.", "h.m.", "innt.", "jan.", "kl.", "m.a.", "mðr.", "mió.",
    "nr.", "nto.", "nov.", "nút.", "o.a.", "o.a.m.", "o.a.tíl.", "o.fl.",
    "ff.", "o.m.a.", "o.o.", "o.s.fr.", "o.tíl.", "o.ø.", "okt.", "omf.",
    "pst.", "ritstj.", "sbr.", "sms.", "smst.", "smb.", "sb.", "sbrt.", "sp.",
    "sept.", "spf.", "spsk.", "t.e.", "t.s.", "t.s.s.", "tlf.", "tel.",
    "tsk.", "t.o.v.", "t.d.", "uml.", "ums.", "uppl.", "upprfr.", "uppr.",
    "útg.", "útl.", "útr.", "vanl.", "v.", "v.h.", "v.ø.o.", "viðm.", "viðv.",
    "vm.", "v.m.",
):
    for _form in (_abbr, _abbr.capitalize()):
        _exc[_form] = [{ORTH: _form}]

# Layer the Faroese entries on top of the language-independent base set.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
20
spacy/lang/nn/__init__.py
Normal file
20
spacy/lang/nn/__init__.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from ..nb import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class NorwegianNynorskDefaults(BaseDefaults):
    """Tokenizer and syntax defaults for Norwegian Nynorsk.

    Punctuation rules and tokenizer exceptions come from this package; the
    syntax iterators are the ones imported from the sibling ``nb`` package.
    """

    syntax_iterators = SYNTAX_ITERATORS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
class NorwegianNynorsk(Language):
    """Language subclass for Norwegian Nynorsk, identified by the code "nn"."""

    Defaults = NorwegianNynorskDefaults
    lang = "nn"
|
||||
|
||||
|
||||
__all__ = ["NorwegianNynorsk"]
|
15
spacy/lang/nn/examples.py
Normal file
15
spacy/lang/nn/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.nn.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
|
||||
sentences = [
|
||||
"Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
|
||||
"Det er ein meir enn i same periode i fjor.",
|
||||
"Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
|
||||
"Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
|
||||
]
|
74
spacy/lang/nn/punctuation.py
Normal file
74
spacy/lang/nn/punctuation.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
CURRENCY,
|
||||
LIST_CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
PUNCT,
|
||||
UNITS,
|
||||
)
|
||||
from ..punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
# Concatenated quote characters with the plain apostrophe removed.
_quotes = CONCAT_QUOTES.replace("'", "")
# Shared punctuation list without "#".
_list_punct = [punct for punct in LIST_PUNCT if punct != "#"]
# Shared icon patterns without the degree sign, both as a standalone entry
# and as an escaped "\u00B0" inside a larger pattern.
_list_icons = [icon.replace("\\u00B0", "") for icon in LIST_ICONS if icon != "°"]
# Shared quote list without the escaped apostrophe.
_list_quotes = [quote for quote in LIST_QUOTES if quote != "\\'"]
|
||||
|
||||
|
||||
# Prefix patterns: a few literal symbols, a "+" that is not followed by a
# digit, then the punctuation list without "#" and the unfiltered shared
# ellipsis, quote, currency and icon lists.
_prefixes = [
    "§",
    "%",
    "=",
    "—",
    "–",
    r"\+(?![0-9])",
    *_list_punct,
    *LIST_ELLIPSES,
    *LIST_QUOTES,
    *LIST_CURRENCY,
    *LIST_ICONS,
]
|
||||
|
||||
|
||||
# Infix patterns: ellipses and icons, followed by separators that only split
# when they sit between the character classes named in each look-around.
_infixes = [
    *LIST_ELLIPSES,
    *_list_icons,
    # period between a lowercase and an uppercase letter
    rf"(?<=[{ALPHA_LOWER}])\.(?=[{ALPHA_UPPER}])",
    # , ! ? between letters
    rf"(?<=[{ALPHA}])[,!?](?=[{ALPHA}])",
    # : < > = / between letters
    rf"(?<=[{ALPHA}])[:<>=/](?=[{ALPHA}])",
    # comma between letters
    rf"(?<=[{ALPHA}]),(?=[{ALPHA}])",
    # quote or bracket between letters
    rf"(?<=[{ALPHA}])([{_quotes}\)\]\(\[])(?=[{ALPHA}])",
    # double hyphen between letters
    rf"(?<=[{ALPHA}])--(?=[{ALPHA}])",
]
|
||||
|
||||
# Suffix patterns: shared punctuation/ellipsis lists, the filtered quote and
# icon lists, dashes, and context-sensitive rules guarded by look-behinds.
_suffixes = [
    *LIST_PUNCT,
    *LIST_ELLIPSES,
    *_list_quotes,
    *_list_icons,
    "—",
    "–",
    # "+" directly after a digit
    r"(?<=[0-9])\+",
    # period after a degree sign plus one of F/C/K (either case)
    r"(?<=°[FfCcKk])\.",
    # currency symbol or unit directly after a digit
    rf"(?<=[0-9])(?:{CURRENCY})",
    rf"(?<=[0-9])(?:{UNITS})",
    # period after the characters collected in the look-behind class
    r"(?<=[{al}{e}{p}(?:{q})])\.".format(
        al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
    ),
    # period after two uppercase letters
    rf"(?<=[{ALPHA_UPPER}][{ALPHA_UPPER}])\.",
    # apostrophe, unless it follows s, x or z (either case)
    r"(?<=[^sSxXzZ])'",
]
# Append the shared default suffixes, leaving out the "'s"-style entries and
# the escaped apostrophe.
_suffixes.extend(
    shared
    for shared in TOKENIZER_SUFFIXES
    if shared not in ["'s", "'S", "’s", "’S", r"\'"]
)
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
228
spacy/lang/nn/tokenizer_exceptions.py
Normal file
228
spacy/lang/nn/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,228 @@
|
|||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
# Month abbreviations, each normalised to the full month name.
# note: "jul." is in the simple list below without a NORM exception
_exc = {
    abbr: [{ORTH: abbr, NORM: month}]
    for abbr, month in [
        ("jan.", "januar"),
        ("feb.", "februar"),
        ("mar.", "mars"),
        ("apr.", "april"),
        ("jun.", "juni"),
        ("aug.", "august"),
        ("sep.", "september"),
        ("okt.", "oktober"),
        ("nov.", "november"),
        ("des.", "desember"),
    ]
}
|
||||
|
||||
|
||||
# Abbreviations and fixed forms that must stay a single token.
for orth in [
    "Ap.", "Aq.", "Ca.", "Chr.", "Co.", "Dr.", "F.eks.", "Fr.p.", "Frp.",
    "Grl.", "Kr.", "Kr.F.", "Kr.F.s", "Mr.", "Mrs.", "Pb.", "Pr.", "Sp.",
    "St.", "a.m.", "ad.", "adm.dir.", "adr.", "b.c.", "bl.a.", "bla.", "bm.",
    "bnr.", "bto.", "c.c.", "ca.", "cand.mag.", "co.", "d.d.", "d.m.", "d.y.",
    "dept.", "dr.", "dr.med.", "dr.philos.", "dr.psychol.", "dss.", "dvs.",
    "e.Kr.", "e.l.", "eg.", "eig.", "ekskl.", "el.", "et.", "etc.", "etg.",
    "ev.", "evt.", "f.", "f.Kr.", "f.eks.", "f.o.m.", "fhv.", "fk.", "foreg.",
    "fork.", "fv.", "fvt.", "g.", "gl.", "gno.", "gnr.", "grl.", "gt.",
    "h.r.adv.", "hhv.", "hoh.", "hr.", "ifb.", "ifm.", "iht.", "inkl.",
    "istf.", "jf.", "jr.", "jul.", "juris.", "kfr.", "kgl.", "kgl.res.",
    "kl.", "komm.", "kr.", "kst.", "lat.", "lø.", "m.a.", "m.a.o.", "m.fl.",
    "m.m.", "m.v.", "ma.", "mag.art.", "md.", "mfl.", "mht.", "mill.", "min.",
    "mnd.", "moh.", "mrd.", "muh.", "mv.", "mva.", "n.å.", "ndf.", "nr.",
    "nto.", "nyno.", "o.a.", "o.l.", "obl.", "off.", "ofl.", "on.", "op.",
    "org.", "osv.", "ovf.", "p.", "p.a.", "p.g.a.", "p.m.", "p.t.", "pga.",
    "ph.d.", "pkt.", "pr.", "pst.", "pt.", "red.anm.", "ref.", "res.",
    "res.kap.", "resp.", "rv.", "s.", "s.d.", "s.k.", "s.u.", "s.å.", "sen.",
    "sep.", "siviling.", "sms.", "snr.", "spm.", "sr.", "sst.", "st.",
    "st.meld.", "st.prp.", "stip.", "stk.", "stud.", "sv.", "såk.", "sø.",
    "t.d.", "t.h.", "t.o.m.", "t.v.", "temp.", "ti.", "tils.", "tilsv.",
    "tl;dr", "tlf.", "to.", "ult.", "utg.", "v.", "vedk.", "vedr.", "vg.",
    "vgs.", "vha.", "vit.ass.", "vn.", "vol.", "vs.", "vsa.", "§§", "©NTB",
    "årg.", "årh.",
]:
    # Only add entries that are not registered yet. "sep." is already present
    # from the month list with NORM "september"; a plain assignment here would
    # replace that entry and silently drop the norm.
    _exc.setdefault(orth, [{ORTH: orth}])
|
||||
|
||||
# Dates
# Day-of-month ordinals "1." through "31." stay a single token.
for _day in range(1, 32):
    _ordinal = f"{_day}."
    _exc[_ordinal] = [{ORTH: _ordinal}]

# "i." is split into the word "i" and a separate period.
# NOTE(review): the name suggests this overrides a base exception - confirm.
_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)

# Layer the Nynorsk entries on top of the language-independent base set.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -1,5 +1,5 @@
|
|||
from functools import partial
|
||||
from typing import List, Optional, cast
|
||||
from typing import List, Optional, Tuple, cast
|
||||
|
||||
from thinc.api import (
|
||||
Dropout,
|
||||
|
@ -12,6 +12,7 @@ from thinc.api import (
|
|||
Relu,
|
||||
Softmax,
|
||||
SparseLinear,
|
||||
SparseLinear_v2,
|
||||
chain,
|
||||
clone,
|
||||
concatenate,
|
||||
|
@ -25,9 +26,10 @@ from thinc.api import (
|
|||
)
|
||||
from thinc.layers.chain import init as init_chain
|
||||
from thinc.layers.resizable import resize_linear_weighted, resize_model
|
||||
from thinc.types import Floats2d
|
||||
from thinc.types import ArrayXd, Floats2d
|
||||
|
||||
from ...attrs import ORTH
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc
|
||||
from ...util import registry
|
||||
from ..extract_ngrams import extract_ngrams
|
||||
|
@ -95,10 +97,48 @@ def build_bow_text_classifier(
|
|||
ngram_size: int,
|
||||
no_output_layer: bool,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
return _build_bow_text_classifier(
|
||||
exclusive_classes=exclusive_classes,
|
||||
ngram_size=ngram_size,
|
||||
no_output_layer=no_output_layer,
|
||||
nO=nO,
|
||||
sparse_linear=SparseLinear(nO=nO),
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures("spacy.TextCatBOW.v3")
def build_bow_text_classifier_v3(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    length: int = 262144,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build the v3 bag-of-words text classifier.

    exclusive_classes, ngram_size, no_output_layer and nO are forwarded
    unchanged to ``_build_bow_text_classifier``.

    length (int): Requested size passed to ``SparseLinear_v2``. Must be at
        least 1 and is rounded up to the next power of two before use.
    RETURNS: The model built by ``_build_bow_text_classifier``.
    RAISES: ValueError (E1056) if ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError(Errors.E1056.format(length=length))

    # Smallest k with length <= 2**k; the table then gets 2**k entries.
    exponent = (length - 1).bit_length()
    sparse_layer = SparseLinear_v2(nO=nO, length=1 << exponent)

    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=sparse_layer,
    )
|
||||
|
||||
|
||||
def _build_bow_text_classifier(
|
||||
exclusive_classes: bool,
|
||||
ngram_size: int,
|
||||
no_output_layer: bool,
|
||||
sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
fill_defaults = {"b": 0, "W": 0}
|
||||
with Model.define_operators({">>": chain}):
|
||||
sparse_linear = SparseLinear(nO=nO)
|
||||
output_layer = None
|
||||
if not no_output_layer:
|
||||
fill_defaults["b"] = NEG_VALUE
|
||||
|
|
|
@ -36,8 +36,9 @@ maxout_pieces = 3
|
|||
depth = 2
|
||||
|
||||
[model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
@ -45,8 +46,9 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
|
|||
|
||||
single_label_bow_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
|
|
@ -35,8 +35,9 @@ maxout_pieces = 3
|
|||
depth = 2
|
||||
|
||||
[model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
|
|||
|
||||
multi_label_bow_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
|
|
@ -162,6 +162,11 @@ def fi_tokenizer():
|
|||
return get_lang_class("fi")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def fo_tokenizer():
    """Session-scoped tokenizer of a freshly instantiated "fo" language class."""
    language_cls = get_lang_class("fo")
    return language_cls().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def fr_tokenizer():
    """Session-scoped tokenizer of a freshly instantiated "fr" language class."""
    language_cls = get_lang_class("fr")
    return language_cls().tokenizer
|
||||
|
@ -317,6 +322,11 @@ def nl_tokenizer():
|
|||
return get_lang_class("nl")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def nn_tokenizer():
    """Session-scoped tokenizer of a freshly instantiated "nn" language class."""
    language_cls = get_lang_class("nn")
    return language_cls().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def pl_tokenizer():
    """Session-scoped tokenizer of a freshly instantiated "pl" language class."""
    language_cls = get_lang_class("pl")
    return language_cls().tokenizer
|
||||
|
|
0
spacy/tests/lang/fo/__init__.py
Normal file
0
spacy/tests/lang/fo/__init__.py
Normal file
26
spacy/tests/lang/fo/test_tokenizer.py
Normal file
26
spacy/tests/lang/fo/test_tokenizer.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
import pytest
|
||||
|
||||
# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
|
||||
# fmt: off
|
||||
FO_TOKEN_EXCEPTION_TESTS = [
|
||||
(
|
||||
"Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
|
||||
[
|
||||
"Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
|
||||
[
|
||||
"Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
|
||||
],
|
||||
),
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
    """Tokenize each sample and compare its non-space tokens to the expectation."""
    observed = [tok.text for tok in fo_tokenizer(text) if not tok.is_space]
    assert expected_tokens == observed
|
0
spacy/tests/lang/nn/__init__.py
Normal file
0
spacy/tests/lang/nn/__init__.py
Normal file
38
spacy/tests/lang/nn/test_tokenizer.py
Normal file
38
spacy/tests/lang/nn/test_tokenizer.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
import pytest
|
||||
|
||||
# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
|
||||
# fmt: off
|
||||
NN_TOKEN_EXCEPTION_TESTS = [
|
||||
(
|
||||
"Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
|
||||
[
|
||||
"Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
|
||||
[
|
||||
"Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
|
||||
[
|
||||
"Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Brukssesongen er frå nov. til mai, med ein topp i mars.",
|
||||
[
|
||||
"Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
|
||||
],
|
||||
),
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
    """Tokenize each sample and compare its non-space tokens to the expectation."""
    observed = [tok.text for tok in nn_tokenizer(text) if not tok.is_space]
    assert expected_tokens == observed
|
|
@ -203,7 +203,7 @@ def test_pipe_class_component_model():
|
|||
"@architectures": "spacy.TextCatEnsemble.v2",
|
||||
"tok2vec": DEFAULT_TOK2VEC_MODEL,
|
||||
"linear_model": {
|
||||
"@architectures": "spacy.TextCatBOW.v2",
|
||||
"@architectures": "spacy.TextCatBOW.v3",
|
||||
"exclusive_classes": False,
|
||||
"ngram_size": 1,
|
||||
"no_output_layer": False,
|
||||
|
|
|
@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
|
|||
@pytest.mark.parametrize(
|
||||
"name,textcat_config",
|
||||
[
|
||||
# BOW
|
||||
# BOW V1
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||
|
@ -451,11 +451,11 @@ def test_no_resize(name, textcat_config):
|
|||
@pytest.mark.parametrize(
|
||||
"name,textcat_config",
|
||||
[
|
||||
# BOW
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||
# BOW V3
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||
# CNN
|
||||
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||
|
@ -480,11 +480,11 @@ def test_resize(name, textcat_config):
|
|||
@pytest.mark.parametrize(
|
||||
"name,textcat_config",
|
||||
[
|
||||
# BOW
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||
# BOW v3
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
|
||||
# CNN
|
||||
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||
|
@ -693,9 +693,14 @@ def test_overfitting_IO_multi():
|
|||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
||||
# BOW V3
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
|
||||
# ENSEMBLE V2
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
|
||||
# CNN V2
|
||||
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
|
||||
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
|
||||
|
|
|
@ -238,7 +238,7 @@ def test_project_push_pull(project_dir):
|
|||
|
||||
def test_find_function_valid():
|
||||
# example of architecture in main code base
|
||||
function = "spacy.TextCatBOW.v2"
|
||||
function = "spacy.TextCatBOW.v3"
|
||||
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
|
||||
assert f"Found registered function '{function}'" in result.stdout
|
||||
assert "textcat.py" in result.stdout
|
||||
|
@ -257,7 +257,7 @@ def test_find_function_valid():
|
|||
|
||||
def test_find_function_invalid():
|
||||
# invalid registry
|
||||
function = "spacy.TextCatBOW.v2"
|
||||
function = "spacy.TextCatBOW.v3"
|
||||
registry = "foobar"
|
||||
result = CliRunner().invoke(
|
||||
app, ["find-function", function, "--registry", registry]
|
||||
|
|
|
@ -376,8 +376,9 @@ def test_util_dot_section():
|
|||
factory = "textcat"
|
||||
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
"""
|
||||
|
|
|
@ -78,16 +78,16 @@ subword features, and a
|
|||
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
|
||||
consisting of a CNN and a layer-normalized maxout activation function.
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| Name | Description |
|
||||
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
|
||||
|
||||
|
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
|
|||
> nO = null
|
||||
>
|
||||
> [model.linear_model]
|
||||
> @architectures = "spacy.TextCatBOW.v2"
|
||||
> @architectures = "spacy.TextCatBOW.v3"
|
||||
> exclusive_classes = true
|
||||
> length = 262144
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
>
|
||||
|
@ -1057,14 +1058,15 @@ after training.
|
|||
|
||||
</Accordion>
|
||||
|
||||
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
|
||||
### spacy.TextCatBOW.v3 {id="TextCatBOW"}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TextCatBOW.v2"
|
||||
> @architectures = "spacy.TextCatBOW.v3"
|
||||
> exclusive_classes = false
|
||||
> length = 262144
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
> nO = null
|
||||
|
@ -1078,14 +1080,19 @@ the others, but may not be as accurate, especially if texts are short.
|
|||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||
| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
|
||||
<Accordion title="Previous versions of spacy.TextCatBOW" spaced>
|
||||
|
||||
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
|
||||
not yet resizable. Since v2, new labels can be added to this component, even
|
||||
after training.
|
||||
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
|
||||
new labels can be added to this component, even after training.
|
||||
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
|
||||
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
|
||||
layer that only used a small number of the allocated parameters.
|
||||
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
|
||||
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
|
||||
|
||||
</Accordion>
|
||||
|
||||
|
|
|
@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
|
|||
to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
|
||||
attribute.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Get the last hidden layer output for "is" (token index 1)
|
||||
> doc = nlp("This is a text.")
|
||||
> tensors = doc._.trf_data.last_hidden_layer_state[1]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `all_outputs` | List of `Ragged` tensors that corresponds to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
|
||||
|
|
|
@ -198,7 +198,9 @@ architecture is usually less accurate than the ensemble, but runs faster.
|
|||
|
||||
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
|
||||
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
|
||||
yet support that.
|
||||
yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
|
||||
erroneous sparse linear layer that only used a small number of the allocated
|
||||
parameters.
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
|
@ -222,6 +224,33 @@ the others, but may not be as accurate, especially if texts are short.
|
|||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"}
|
||||
|
||||
Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
|
||||
linear layer that only used a small number of the allocated parameters.
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TextCatBOW.v2"
|
||||
> exclusive_classes = false
|
||||
> ngram_size = 1
|
||||
> no_output_layer = false
|
||||
> nO = null
|
||||
> ```
|
||||
|
||||
An n-gram "bag-of-words" model. This architecture should run much faster than
|
||||
the others, but may not be as accurate, especially if texts are short.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
|
||||
|
||||
Identical to
|
||||
|
|
|
@ -397,6 +397,17 @@ are wrapped into the
|
|||
by this class. Instances of this class are typically assigned to the
|
||||
[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Get the last hidden layer output for "is" (token index 1)
|
||||
> doc = nlp("This is a text.")
|
||||
> indices = doc._.trf_data.align[1].data.flatten()
|
||||
> last_hidden_state = doc._.trf_data.model_output.last_hidden_state
|
||||
> dim = last_hidden_state.shape[-1]
|
||||
> tensors = last_hidden_state.reshape(-1, dim)[indices]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
|
||||
|
|
|
@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:
|
|||
|
||||
#### CNN/CPU pipelines with floret vectors
|
||||
|
||||
The Finnish, Korean and Swedish `md` and `lg` pipelines use
|
||||
[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're
|
||||
running a trained pipeline on texts and working with [`Doc`](/api/doc) objects,
|
||||
you shouldn't notice any difference with floret vectors. With floret vectors no
|
||||
tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will
|
||||
return `False` for all tokens.
|
||||
The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
|
||||
pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
|
||||
If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
|
||||
objects, you shouldn't notice any difference with floret vectors. With floret
|
||||
vectors no tokens are out-of-vocabulary, so
|
||||
[`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.
|
||||
|
||||
If you access vectors directly for similarity comparisons, there are a few
|
||||
differences because floret vectors don't include a fixed word list like the
|
||||
|
@ -132,10 +132,20 @@ vector keys for default vectors.
|
|||
|
||||
### Transformer pipeline design {id="design-trf"}
|
||||
|
||||
In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
|
||||
all listen to the `transformer` component. The `attribute_ruler` and
|
||||
In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
|
||||
present) all listen to the `transformer` component. The `attribute_ruler` and
|
||||
`lemmatizer` have the same configuration as in the CNN models.
|
||||
|
||||
For spaCy v3.0-v3.6, `trf` pipelines use
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
|
||||
transformer output in `doc._.trf_data` is a
|
||||
[`TransformerData`](/api/transformer#transformerdata) object.
|
||||
|
||||
For spaCy v3.7+, `trf` pipelines use
|
||||
[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
|
||||
and `doc._.trf_data` is a
|
||||
[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
|
||||
|
||||
### Modifying the default pipeline {id="design-modify"}
|
||||
|
||||
For faster processing, you may only want to run a subset of the components in a
|
||||
|
|
|
@ -153,8 +153,9 @@ maxout_pieces = 3
|
|||
depth = 2
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
```
|
||||
|
@ -170,8 +171,9 @@ factory = "textcat"
|
|||
labels = []
|
||||
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
|
|
@ -1328,8 +1328,9 @@ labels = []
|
|||
# This function is created and then passed to the "textcat" component as
|
||||
# the argument "model"
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
|
|
|
@ -103,6 +103,10 @@
|
|||
"has_examples": true,
|
||||
"models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
|
||||
},
|
||||
{
|
||||
"code": "fo",
|
||||
"name": "Faroese"
|
||||
},
|
||||
{
|
||||
"code": "fr",
|
||||
"name": "French",
|
||||
|
@ -290,6 +294,12 @@
|
|||
"example": "Dit is een zin.",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "nn",
|
||||
"name": "Norwegian Nynorsk",
|
||||
"example": "Det er ein meir enn i same periode i fjor.",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "pl",
|
||||
"name": "Polish",
|
||||
|
|
|
@ -66,6 +66,10 @@
|
|||
{
|
||||
"text": "Stack Overflow",
|
||||
"url": "http://stackoverflow.com/questions/tagged/spacy"
|
||||
},
|
||||
{
|
||||
"text": "Merchandise",
|
||||
"url": "https://explosion.ai/merch"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
|
|
@ -4500,6 +4500,23 @@
|
|||
"website": "https://nlp.unibuc.ro/people/snisioi.html"
|
||||
},
|
||||
"category": ["pipeline", "training", "models"]
|
||||
},
|
||||
{
|
||||
"id": "redfield-spacy-nodes",
|
||||
"title": "Redfield NLP Nodes for KNIME",
|
||||
"slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
|
||||
"description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
|
||||
"github": "Redfield-AB/Spacy-Nodes",
|
||||
"url": "https://redfield.ai/spacy-redfield/",
|
||||
"thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
|
||||
"image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
|
||||
"author": "Redfield AB",
|
||||
"author_links": {
|
||||
"twitter": "Redfield_AB",
|
||||
"github": "Redfield-AB",
|
||||
"website": "https://redfield.ai"
|
||||
},
|
||||
"category": ["standalone"]
|
||||
}
|
||||
],
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user