Merge pull request #11448 from shadeMe/merge-develop-into-v4

Merge `develop` into `v4`
Committed by Madeesh Kannan on 2022-09-07 13:26:11 +02:00 (via GitHub)
commit 60c050e82b
38 changed files with 752 additions and 208 deletions

View File

@@ -6,7 +6,6 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.1.0,<8.2.0",
-    "pathy",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.9,<3.1.0
+spacy-legacy>=3.0.10,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
@@ -34,4 +34,5 @@ mypy>=0.910,<0.970; platform_machine!='aarch64'
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
+types-setuptools>=57.0.0
 black>=22.0,<23.0

View File

@@ -33,7 +33,7 @@ include_package_data = true
 python_requires = >=3.6
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.9,<3.1.0
+    spacy-legacy>=3.0.10,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
@@ -42,9 +42,9 @@ install_requires =
     wasabi>=0.9.1,<1.1.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
-    # Third-party dependencies
     typer>=0.3.0,<0.5.0
     pathy>=0.3.5
+    # Third-party dependencies
    tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0

View File

@@ -31,21 +31,21 @@ def load(
     name: Union[str, Path],
     *,
     vocab: Union[Vocab, bool] = True,
-    disable: Iterable[str] = util.SimpleFrozenList(),
-    enable: Iterable[str] = util.SimpleFrozenList(),
-    exclude: Iterable[str] = util.SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
 ) -> Language:
     """Load a spaCy model from an installed package or a local path.

     name (str): Package name or model path.
     vocab (Vocab): A Vocab object. If True, a vocab is created.
-    disable (Iterable[str]): Names of pipeline components to disable. Disabled
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
-    enable (Iterable[str]): Names of pipeline components to enable. All other
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
         pipes will be disabled (but can be enabled later using nlp.enable_pipe).
-    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
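
Since `disable`, `enable` and `exclude` now also accept a bare string, a single component name no longer needs to be wrapped in a list. A minimal usage sketch (assuming an installed `en_core_web_sm` package):

```python
import spacy

# A plain string is treated like a one-element list
nlp = spacy.load("en_core_web_sm", disable="ner")
assert "ner" in nlp.component_names  # still loaded...
assert "ner" not in nlp.pipe_names   # ...but not run
```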

View File

@@ -20,7 +20,7 @@ def download_cli(
     ctx: typer.Context,
     model: str = Arg(..., help="Name of pipeline package to download"),
     direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
-    sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
+    sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
     # fmt: on
 ):
     """
@@ -36,7 +36,12 @@ def download_cli(
     download(model, direct, sdist, *ctx.args)


-def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
+def download(
+    model: str,
+    direct: bool = False,
+    sdist: bool = False,
+    *pip_args,
+) -> None:
     if (
         not (is_package("spacy") or is_package("spacy-nightly"))
         and "--no-deps" not in pip_args
@@ -50,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
             "dependencies, you'll have to install them manually."
         )
         pip_args = pip_args + ("--no-deps",)
-    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
-    dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
     if direct:
         components = model.split("-")
         model_name = "".join(components[:-1])
         version = components[-1]
-        download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
     else:
         model_name = model
         if model in OLD_MODEL_SHORTCUTS:
@@ -67,13 +69,26 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
             model_name = OLD_MODEL_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-    download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
+    filename = get_model_filename(model_name, version, sdist)
+    download_model(filename, pip_args)
     msg.good(
         "Download and installation successful",
         f"You can now load the package via spacy.load('{model_name}')",
     )


+def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
+    dl_tpl = "{m}-{v}/{m}-{v}{s}"
+    egg_tpl = "#egg={m}=={v}"
+    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
+    filename = dl_tpl.format(m=model_name, v=version, s=suffix)
+    if sdist:
+        filename += egg_tpl.format(m=model_name, v=version)
+    return filename
+
+
 def get_compatibility() -> dict:
     if is_prerelease_version(about.__version__):
         version: Optional[str] = about.__version__
@@ -105,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
     return comp[model][0]


+def get_latest_version(model: str) -> str:
+    comp = get_compatibility()
+    return get_version(model, comp)
+
+
 def download_model(
     filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
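
To make the new helper concrete, here is a standalone sketch of the filename logic with the suffix constants inlined (their exact values are an assumption for illustration; spaCy imports them from its CLI utilities):

```python
SDIST_SUFFIX = ".tar.gz"            # assumed value for illustration
WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value for illustration

def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    # Wheels resolve on their own; sdists keep the #egg fragment for pip
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    filename = f"{model_name}-{version}/{model_name}-{version}{suffix}"
    if sdist:
        filename += f"#egg={model_name}=={version}"
    return filename

print(get_model_filename("en_core_web_sm", "3.4.0"))
# en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
```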

View File

@@ -1,10 +1,13 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
+import pkg_resources
+import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
 import srsly

 from ._util import app, Arg, Opt, string_to_list
+from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about
@@ -16,6 +19,7 @@ def info_cli(
     markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
     silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
     exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
+    url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
     # fmt: on
 ):
     """
@@ -23,10 +27,19 @@ def info_cli(
     print its meta information. Flag --markdown prints details in Markdown for easy
     copy-pasting to GitHub issues.

+    Flag --url prints only the download URL of the most recent compatible
+    version of the pipeline.
+
     DOCS: https://spacy.io/api/cli#info
     """
     exclude = string_to_list(exclude)
-    info(model, markdown=markdown, silent=silent, exclude=exclude)
+    info(
+        model,
+        markdown=markdown,
+        silent=silent,
+        exclude=exclude,
+        url=url,
+    )


 def info(
@@ -35,11 +48,20 @@ def info(
     markdown: bool = False,
     silent: bool = True,
     exclude: Optional[List[str]] = None,
+    url: bool = False,
 ) -> Union[str, dict]:
     msg = Printer(no_print=silent, pretty=not silent)
     if not exclude:
         exclude = []
-    if model:
+    if url:
+        if model is not None:
+            title = f"Download info for pipeline '{model}'"
+            data = info_model_url(model)
+            print(data["download_url"])
+            return data
+        else:
+            msg.fail("--url option requires a pipeline name", exits=1)
+    elif model:
         title = f"Info about pipeline '{model}'"
         data = info_model(model, silent=silent)
     else:
@@ -99,11 +121,43 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         meta["source"] = str(model_path.resolve())
     else:
         meta["source"] = str(model_path)
+    download_url = info_installed_model_url(model)
+    if download_url:
+        meta["download_url"] = download_url
     return {
         k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
     }


+def info_installed_model_url(model: str) -> Optional[str]:
+    """Given a pipeline name, get the download URL if available, otherwise
+    return None.
+
+    This is only available for pipelines installed as modules that have
+    dist-info available.
+    """
+    try:
+        dist = pkg_resources.get_distribution(model)
+        data = json.loads(dist.get_metadata("direct_url.json"))
+        return data["url"]
+    except pkg_resources.DistributionNotFound:
+        # no such package
+        return None
+    except Exception:
+        # something else, like no file or invalid JSON
+        return None
+
+
+def info_model_url(model: str) -> Dict[str, Any]:
+    """Return the download URL for the latest version of a pipeline."""
+    version = get_latest_version(model)
+    filename = get_model_filename(model, version)
+    download_url = about.__download_url__ + "/" + filename
+    release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
+    release_url = release_tpl.format(m=model, v=version)
+    return {"download_url": download_url, "release_url": release_url}
+
+
 def get_markdown(
     data: Dict[str, Any],
     title: Optional[str] = None,
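
A quick sketch of the new option from Python (the pipeline name is illustrative, and looking up the latest version requires network access to the compatibility table):

```python
from spacy.cli.info import info

# With url=True, info() prints the download URL for the most recent
# compatible version and returns it together with the release URL
data = info("en_core_web_sm", url=True)
print(data["download_url"])
print(data["release_url"])
```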

View File

@@ -230,8 +230,9 @@ class Errors(metaclass=ErrorsWithCodes):
             "initialized component.")
     E004 = ("Can't set up pipeline component: a factory for '{name}' already "
             "exists. Existing factory: {func}. New factory: {new_func}")
-    E005 = ("Pipeline component '{name}' returned None. If you're using a "
-            "custom component, maybe you forgot to return the processed Doc?")
+    E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
+            "Doc. If you're using a custom component, maybe you forgot to "
+            "return the processed Doc?")
     E006 = ("Invalid constraints for adding pipeline component. You can only "
             "set one of the following: before (component name or index), "
             "after (component name or index), first (True) or last (True). "

View File

@@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
                 oov_forms.append(form)
         if not forms:
             forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(self.lookup_lemmatize(token)[0])
+
+        # use lookups, and fall back to the token itself
         if not forms:
-            forms.append(string)
+            forms.append(lookup_table.get(string, [string])[0])
         forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms

View File

@@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
         rules = rules_table.get(univ_pos, [])
         string = string.lower()
         forms = []
+        # first try lookup in table based on upos
         if string in index:
             forms.append(string)
             self.cache[cache_key] = forms
             return forms
+
+        # then add anything in the exceptions table
         forms.extend(exceptions.get(string, []))
+
+        # if nothing found yet, use the rules
         oov_forms = []
         if not forms:
             for old, new in rules:
@@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
                         forms.append(form)
                     else:
                         oov_forms.append(form)
+        # if still nothing, add the oov forms from rules
         if not forms:
             forms.extend(oov_forms)
-        if not forms and string in lookup_table.keys():
-            forms.append(self.lookup_lemmatize(token)[0])
+
+        # use lookups, which fall back to the token itself
         if not forms:
-            forms.append(string)
+            forms.append(lookup_table.get(string, [string])[0])
         forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms

spacy/lang/la/__init__.py (new file, 18 lines)
View File

@@ -0,0 +1,18 @@
+from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+
+
+class LatinDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+
+
+class Latin(Language):
+    lang = "la"
+    Defaults = LatinDefaults
+
+
+__all__ = ["Latin"]
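
Once registered, the new language is available through the usual entry points. A quick sketch (tokenizer-only, since no trained Latin pipeline ships with spaCy):

```python
import spacy

nlp = spacy.blank("la")
doc = nlp("scio te omnia facturum, ut nobiscum quam primum sis")
# the "nobiscum" exception splits into "nobis" + "cum"
print([t.text for t in doc])
```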

View File

@@ -0,0 +1,34 @@
+from ...attrs import LIKE_NUM
+import re
+
+# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
+roman_numerals_compile = re.compile(
+    r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
+)
+
+_num_words = set(
+    """
+unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
+""".split()
+)
+
+_ordinal_words = set(
+    """
+primus prima primum secundus secunda secundum tertius tertia tertium
+""".split()
+)
+
+
+def like_num(text):
+    if text.isdigit():
+        return True
+    if roman_numerals_compile.match(text):
+        return True
+    if text.lower() in _num_words:
+        return True
+    if text.lower() in _ordinal_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
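
The `{0,4}` quantifiers and the `(?i)` flag deliberately accept additive forms like `IIII`, while the lookahead rejects the empty string. A standalone check of the pattern:

```python
import re

pattern = re.compile(
    r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
for text in ["IIII", "MMXXII", "iv", "MMXX11", ""]:
    print(repr(text), bool(pattern.match(text)))
# 'IIII' True, 'MMXXII' True, 'iv' True, 'MMXX11' False, '' False
```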

View File

@@ -0,0 +1,37 @@
+# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
+
+STOP_WORDS = set(
+    """
+ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
+
+cum cur
+
+de deinde dum
+
+ego enim ergo es est et etiam etsi ex
+
+fio
+
+haud hic
+
+iam idem igitur ille in infra inter interim ipse is ita
+
+magis modo mox
+
+nam ne nec necque neque nisi non nos
+
+o ob
+
+per possum post pro
+
+quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
+
+sed si sic sive sub sui sum super suus
+
+tam tamen trans tu tum
+
+ubi uel uero
+
+vel vero
+""".split()
+)

View File

@@ -0,0 +1,76 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH
+from ...util import update_exc
+
+## TODO: Look into systematically handling u/v
+_exc = {
+    "mecum": [{ORTH: "me"}, {ORTH: "cum"}],
+    "tecum": [{ORTH: "te"}, {ORTH: "cum"}],
+    "nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
+    "vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
+    "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
+}
+
+for orth in [
+    "A.",
+    "Agr.",
+    "Ap.",
+    "C.",
+    "Cn.",
+    "D.",
+    "F.",
+    "K.",
+    "L.",
+    "M'.",
+    "M.",
+    "Mam.",
+    "N.",
+    "Oct.",
+    "Opet.",
+    "P.",
+    "Paul.",
+    "Post.",
+    "Pro.",
+    "Q.",
+    "S.",
+    "Ser.",
+    "Sert.",
+    "Sex.",
+    "St.",
+    "Sta.",
+    "T.",
+    "Ti.",
+    "V.",
+    "Vol.",
+    "Vop.",
+    "U.",
+    "Uol.",
+    "Uop.",
+    "Ian.",
+    "Febr.",
+    "Mart.",
+    "Apr.",
+    "Mai.",
+    "Iun.",
+    "Iul.",
+    "Aug.",
+    "Sept.",
+    "Oct.",
+    "Nov.",
+    "Nou.",
+    "Dec.",
+    "Non.",
+    "Id.",
+    "A.D.",
+    "Coll.",
+    "Cos.",
+    "Ord.",
+    "Pl.",
+    "S.C.",
+    "Suff.",
+    "Trib.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -1028,8 +1028,8 @@ class Language:
                 raise ValueError(Errors.E109.format(name=name)) from e
             except Exception as e:
                 error_handler(name, proc, [doc], e)
-            if doc is None:
-                raise ValueError(Errors.E005.format(name=name))
+            if not isinstance(doc, Doc):
+                raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
         return doc

     def disable_pipes(self, *names) -> "DisabledPipes":
@@ -1063,7 +1063,7 @@ class Language:
         """
         if enable is None and disable is None:
             raise ValueError(Errors.E991)
-        if disable is not None and isinstance(disable, str):
+        if isinstance(disable, str):
             disable = [disable]
         if enable is not None:
             if isinstance(enable, str):
@@ -1698,9 +1698,9 @@ class Language:
         config: Union[Dict[str, Any], Config] = {},
         *,
         vocab: Union[Vocab, bool] = True,
-        disable: Iterable[str] = SimpleFrozenList(),
-        enable: Iterable[str] = SimpleFrozenList(),
-        exclude: Iterable[str] = SimpleFrozenList(),
+        disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+        enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+        exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
         meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
@@ -1711,12 +1711,12 @@ class Language:
         config (Dict[str, Any] / Config): The loaded config.
         vocab (Vocab): A Vocab object. If True, a vocab is created.
-        disable (Iterable[str]): Names of pipeline components to disable.
+        disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
             Disabled pipes will be loaded but they won't be run unless you
             explicitly enable them by calling nlp.enable_pipe.
-        enable (Iterable[str]): Names of pipeline components to enable. All other
+        enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
             pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
-        exclude (Iterable[str]): Names of pipeline components to exclude.
+        exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
             Excluded components won't be loaded.
         meta (Dict[str, Any]): Meta overrides for nlp.meta.
         auto_fill (bool): Automatically fill in missing values in config based
@@ -1727,6 +1727,12 @@ class Language:
         DOCS: https://spacy.io/api/language#from_config
         """
+        if isinstance(disable, str):
+            disable = [disable]
+        if isinstance(enable, str):
+            enable = [enable]
+        if isinstance(exclude, str):
+            exclude = [exclude]
         if auto_fill:
             config = Config(
                 cls.default_config, section_order=CONFIG_SECTION_ORDER
@@ -2031,25 +2037,29 @@ class Language:
     @staticmethod
     def _resolve_component_status(
-        disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str]
+        disable: Union[str, Iterable[str]],
+        enable: Union[str, Iterable[str]],
+        pipe_names: Iterable[str],
     ) -> Tuple[str, ...]:
         """Derives whether (1) `disable` and `enable` values are consistent and (2)
         resolves those to a single set of disabled components. Raises an error in
         case of inconsistency.

-        disable (Iterable[str]): Names of components or serialization fields to disable.
-        enable (Iterable[str]): Names of pipeline components to enable.
+        disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable.
+        enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable.
         pipe_names (Iterable[str]): Names of all pipeline components.
         RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t.
             specified includes and excludes.
         """
-        if disable is not None and isinstance(disable, str):
+        if isinstance(disable, str):
             disable = [disable]
         to_disable = disable
         if enable:
+            if isinstance(enable, str):
+                enable = [enable]
             to_disable = [
                 pipe_name for pipe_name in pipe_names if pipe_name not in enable
             ]
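
The same string-or-iterable convenience applies to `nlp.select_pipes`, whose handling is touched above. A short sketch:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("parser")

# disable accepts a bare component name as well as a list
with nlp.select_pipes(disable="tagger"):
    assert "tagger" not in nlp.pipe_names
assert "tagger" in nlp.pipe_names  # restored on exit
```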

View File

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True
-from typing import List
+from typing import List, Iterable

 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int8_t
@@ -868,20 +868,27 @@ class _SetPredicate:
     def __call__(self, Token token):
         if self.is_extension:
-            value = get_string_id(token._.get(self.attr))
+            value = token._.get(self.attr)
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)

-        if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
+        if self.predicate in ("IN", "NOT_IN"):
+            if isinstance(value, (str, int)):
+                value = get_string_id(value)
+            else:
+                return False
+        elif self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
+            # ensure that all values are enclosed in a set
             if self.attr == MORPH:
                 # break up MORPH into individual Feat=Val values
                 value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            elif isinstance(value, (str, int)):
+                value = set((get_string_id(value),))
+            elif isinstance(value, Iterable) and all(isinstance(v, (str, int)) for v in value):
+                value = set(get_string_id(v) for v in value)
             else:
-                # treat a single value as a list
-                if isinstance(value, (str, int)):
-                    value = set([get_string_id(value)])
-                else:
-                    value = set(get_string_id(v) for v in value)
         if self.predicate == "IN":
             return value in self.value
         elif self.predicate == "NOT_IN":
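
In user terms: `IN`/`NOT_IN` now only ever compare a scalar extension value against the pattern set, and the set predicates coerce scalars and clean iterables into sets, matching nothing otherwise. A sketch mirroring the updated tests below:

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Token

nlp = spacy.blank("en")
Token.set_extension("ext", default=[])

matcher = Matcher(nlp.vocab)
matcher.add("M", [[{"_": {"ext": {"IN": ["A", "C"]}}}]])
doc = Doc(nlp.vocab, words=["a", "b", "c"])

doc[0]._.ext = ["A", "B"]  # a list is not a scalar -> no match
print(len(matcher(doc)))   # 0
doc[0]._.ext = "A"         # exact scalar membership -> match
print(len(matcher(doc)))   # 1
```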

View File

@@ -256,6 +256,11 @@ def ko_tokenizer_tokenizer():
     return nlp.tokenizer


+@pytest.fixture(scope="module")
+def la_tokenizer():
+    return get_lang_class("la")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ko_tokenizer_natto():
     pytest.importorskip("natto")

View File

View File

@@ -0,0 +1,8 @@
+import pytest
+
+
+def test_la_tokenizer_handles_exc_in_text(la_tokenizer):
+    text = "scio te omnia facturum, ut nobiscum quam primum sis"
+    tokens = la_tokenizer(text)
+    assert len(tokens) == 11
+    assert tokens[6].text == "nobis"

View File

@@ -0,0 +1,35 @@
+import pytest
+
+from spacy.lang.la.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("IIII", True),
+        ("VI", True),
+        ("vi", True),
+        ("IV", True),
+        ("iv", True),
+        ("IX", True),
+        ("ix", True),
+        ("MMXXII", True),
+        ("0", True),
+        ("1", True),
+        ("quattuor", True),
+        ("decem", True),
+        ("tertius", True),
+        ("canis", False),
+        ("MMXX11", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(la_tokenizer, text, match):
+    tokens = la_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
+
+
+@pytest.mark.parametrize("word", ["quinque"])
+def test_la_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())

View File

@@ -368,6 +368,16 @@ def test_matcher_intersect_value_operator(en_vocab):
     doc[0]._.ext = ["A", "B"]
     assert len(matcher(doc)) == 1

+    # INTERSECTS matches nothing for iterables that aren't all str or int
+    matcher = Matcher(en_vocab)
+    pattern = [{"_": {"ext": {"INTERSECTS": ["Abx", "C"]}}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0]._.ext = [["Abx"], "B"]
+    assert len(matcher(doc)) == 0
+    doc[0]._.ext = ["Abx", "B"]
+    assert len(matcher(doc)) == 1
+
     # INTERSECTS with an empty pattern list matches nothing
     matcher = Matcher(en_vocab)
     pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
@@ -476,14 +486,22 @@ def test_matcher_extension_set_membership(en_vocab):
     assert len(matches) == 0


-@pytest.mark.xfail(reason="IN predicate must handle sequence values in extensions")
 def test_matcher_extension_in_set_predicate(en_vocab):
     matcher = Matcher(en_vocab)
     Token.set_extension("ext", default=[])
     pattern = [{"_": {"ext": {"IN": ["A", "C"]}}}]
     matcher.add("M", [pattern])
     doc = Doc(en_vocab, words=["a", "b", "c"])
+    # The IN predicate expects an exact match between the
+    # extension value and one of the pattern's values.
     doc[0]._.ext = ["A", "B"]
+    assert len(matcher(doc)) == 0
+    doc[0]._.ext = ["A"]
+    assert len(matcher(doc)) == 0
+    doc[0]._.ext = "A"
     assert len(matcher(doc)) == 1

View File

@@ -17,6 +17,7 @@ def test_build_dependencies():
         "types-dataclasses",
         "types-mock",
         "types-requests",
+        "types-setuptools",
     ]
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [

View File

@@ -618,6 +618,7 @@ def test_load_disable_enable() -> None:
         base_nlp.to_disk(tmp_dir)
         to_disable = ["parser", "tagger"]
         to_enable = ["tagger", "parser"]
+        single_str = "tagger"

         # Setting only `disable`.
         nlp = spacy.load(tmp_dir, disable=to_disable)
@@ -632,6 +633,16 @@ def test_load_disable_enable() -> None:
             ]
         )

+        # Loading with a string representing one component
+        nlp = spacy.load(tmp_dir, exclude=single_str)
+        assert single_str not in nlp.component_names
+
+        nlp = spacy.load(tmp_dir, disable=single_str)
+        assert single_str in nlp.component_names
+        assert single_str not in nlp.pipe_names
+        assert nlp._disabled == {single_str}
+        assert nlp.disabled == [single_str]
+
         # Testing consistent enable/disable combination.
         nlp = spacy.load(
             tmp_dir,

View File

@@ -670,3 +670,25 @@ def test_dot_in_factory_names(nlp):
     with pytest.raises(ValueError, match="not permitted"):
         Language.factory("my.evil.component.v1", func=evil_component)
+
+
+def test_component_return():
+    """Test that an error is raised if components return a type other than a
+    doc."""
+    nlp = English()
+
+    @Language.component("test_component_good_pipe")
+    def good_pipe(doc):
+        return doc
+
+    nlp.add_pipe("test_component_good_pipe")
+    nlp("text")
+    nlp.remove_pipe("test_component_good_pipe")
+
+    @Language.component("test_component_bad_pipe")
+    def bad_pipe(doc):
+        return doc.text
+
+    nlp.add_pipe("test_component_bad_pipe")
+    with pytest.raises(ValueError, match="instead of a Doc"):
+        nlp("text")

View File

@@ -10,7 +10,8 @@ from spacy.ml._precomputable_affine import _backprop_precomputable_affine_paddin
 from spacy.util import dot_to_object, SimpleFrozenList, import_file
 from spacy.util import to_ternary_int
 from thinc.api import Config, Optimizer, ConfigValidationError
-from thinc.api import set_current_ops
+from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
+from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
 from spacy.training.batchers import minibatch_by_words
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
@@ -18,7 +19,6 @@ from spacy.language import DEFAULT_CONFIG_PATH
 from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema
 from pydantic import ValidationError
-from thinc.api import get_current_ops, NumpyOps, CupyOps

 from .util import get_random_doc, make_tempdir
@@ -111,26 +111,25 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 def test_prefer_gpu():
     current_ops = get_current_ops()
-    try:
-        import cupy  # noqa: F401
-
-        prefer_gpu()
+    if has_cupy_gpu:
+        assert prefer_gpu()
         assert isinstance(get_current_ops(), CupyOps)
-    except ImportError:
+    elif has_torch_mps_gpu:
+        assert prefer_gpu()
+        assert isinstance(get_current_ops(), MPSOps)
+    else:
         assert not prefer_gpu()
     set_current_ops(current_ops)


 def test_require_gpu():
     current_ops = get_current_ops()
-    try:
-        import cupy  # noqa: F401
-
+    if has_cupy_gpu:
         require_gpu()
         assert isinstance(get_current_ops(), CupyOps)
-    except ImportError:
-        with pytest.raises(ValueError):
-            require_gpu()
+    elif has_torch_mps_gpu:
+        require_gpu()
+        assert isinstance(get_current_ops(), MPSOps)
     set_current_ops(current_ops)
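
The thinc compatibility flags make these tests deterministic instead of relying on import success. For reference, the user-facing behavior (a sketch; `prefer_gpu` falls back to CPU and returns False when neither CuPy nor a torch MPS device is available):

```python
import spacy

if spacy.prefer_gpu():  # can now select Apple MPS via torch, not just CuPy
    print("GPU ops active")
else:
    print("CPU ops active")  # spacy.require_gpu() would raise here instead
```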

View File

@@ -0,0 +1,30 @@
+import pytest
+import spacy
+
+from spacy.training import loggers
+
+
+@pytest.fixture()
+def nlp():
+    nlp = spacy.blank("en")
+    nlp.add_pipe("ner")
+    return nlp
+
+
+@pytest.fixture()
+def info():
+    return {
+        "losses": {"ner": 100},
+        "other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80},
+        "epoch": 100,
+        "step": 125,
+        "score": 85,
+    }
+
+
+def test_console_logger(nlp, info):
+    console_logger = loggers.console_logger(
+        progress_bar=True, console_output=True, output_file=None
+    )
+    log_step, finalize = console_logger(nlp)
+    log_step(info)

View File

@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable, Optional

 from .doc import Doc
 from .span import Span
@@ -24,4 +24,4 @@ class SpanGroup:
     def __getitem__(self, i: int) -> Span: ...
     def to_bytes(self) -> bytes: ...
     def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
-    def copy(self) -> SpanGroup: ...
+    def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...

View File

@@ -244,15 +244,18 @@ cdef class SpanGroup:
     cdef void push_back(self, const shared_ptr[SpanC] &span):
         self.c.push_back(span)

-    def copy(self) -> SpanGroup:
+    def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
         """Clones the span group.

+        doc (Doc): New reference document to which the copy is bound.
         RETURNS (SpanGroup): A copy of the span group.

         DOCS: https://spacy.io/api/spangroup#copy
         """
+        if doc is None:
+            doc = self.doc
         return SpanGroup(
-            self.doc,
+            doc,
             name=self.name,
             attrs=deepcopy(self.attrs),
             spans=list(self),
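
A minimal sketch of the extended `copy`; without arguments it behaves as before, while the new `doc` argument rebinds the copy to another document:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")
doc.spans["cities"] = [doc[0:2]]

group_copy = doc.spans["cities"].copy()  # no doc given -> bound to same doc
assert group_copy.doc is doc
assert [span.text for span in group_copy] == ["New York"]
```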

View File

@@ -42,7 +42,8 @@ class SpanGroups(UserDict):
     def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
         if doc is None:
             doc = self._ensure_doc()
-        return SpanGroups(doc).from_bytes(self.to_bytes())
+        data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
+        return SpanGroups(doc, items=data_copy)

     def setdefault(self, key, default=None):
         if not isinstance(default, SpanGroup):

View File

@@ -1,10 +1,13 @@
-from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union
 from wasabi import Printer
+from pathlib import Path
 import tqdm
 import sys
+import srsly

 from ..util import registry
 from ..errors import Errors
+from .. import util

 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
@@ -23,13 +26,44 @@ def setup_table(
     return final_cols, final_widths, ["r" for _ in final_widths]


-@registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger(progress_bar: bool = False):
+@registry.loggers("spacy.ConsoleLogger.v2")
+def console_logger(
+    progress_bar: bool = False,
+    console_output: bool = True,
+    output_file: Optional[Union[str, Path]] = None,
+):
+    """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
+    progress_bar (bool): Whether the logger should print the progress bar.
+    console_output (bool): Whether the logger should print the logs on the console.
+    output_file (Optional[Union[str, Path]]): The file to save the training logs to.
+    """
+    _log_exist = False
+    if output_file:
+        output_file = util.ensure_path(output_file)  # type: ignore
+        if output_file.exists():  # type: ignore
+            _log_exist = True
+        if not output_file.parents[0].exists():  # type: ignore
+            output_file.parents[0].mkdir(parents=True)  # type: ignore
+
     def setup_printer(
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         write = lambda text: print(text, file=stdout, flush=True)
         msg = Printer(no_print=True)
+
+        nonlocal output_file
+        output_stream = None
+        if _log_exist:
+            write(
+                msg.warn(
+                    f"Saving logs is disabled because {output_file} already exists."
+                )
+            )
+            output_file = None
+        elif output_file:
+            write(msg.info(f"Saving results to {output_file}"))
+            output_stream = open(output_file, "w", encoding="utf-8")
+
         # ensure that only trainable components are logged
         logged_pipes = [
             name
@@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False):
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
         loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
-        spacing = 2
-        table_header, table_widths, table_aligns = setup_table(
-            cols=["E", "#"] + loss_cols + score_cols + ["Score"],
-            widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
-        )
-        write(msg.row(table_header, widths=table_widths, spacing=spacing))
-        write(msg.row(["-" * width for width in table_widths], spacing=spacing))
+
+        if console_output:
+            spacing = 2
+            table_header, table_widths, table_aligns = setup_table(
+                cols=["E", "#"] + loss_cols + score_cols + ["Score"],
+                widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
+            )
+            write(msg.row(table_header, widths=table_widths, spacing=spacing))
+            write(msg.row(["-" * width for width in table_widths], spacing=spacing))
         progress = None

     def log_step(info: Optional[Dict[str, Any]]) -> None:
@@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False):
             if progress is not None:
                 progress.update(1)
             return
-        losses = [
-            "{0:.2f}".format(float(info["losses"][pipe_name]))
-            for pipe_name in logged_pipes
-        ]
+
+        losses = []
+        log_losses = {}
+        for pipe_name in logged_pipes:
+            losses.append("{0:.2f}".format(float(info["losses"][pipe_name])))
+            log_losses[pipe_name] = float(info["losses"][pipe_name])

         scores = []
+        log_scores = {}
         for col in score_cols:
             score = info["other_scores"].get(col, 0.0)
             try:
@@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False):
             if col != "speed":
                 score *= 100
             scores.append("{0:.2f}".format(score))
+            log_scores[str(col)] = score

         data = (
             [info["epoch"], info["step"]]
@@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False):
             + scores
             + ["{0:.2f}".format(float(info["score"]))]
         )
+
+        if output_stream:
+            # Write to log file per log_step
+            log_data = {
+                "epoch": info["epoch"],
+                "step": info["step"],
+                "losses": log_losses,
+                "scores": log_scores,
+                "score": float(info["score"]),
+            }
+            output_stream.write(srsly.json_dumps(log_data) + "\n")
+
         if progress is not None:
             progress.close()
-        write(
-            msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
-        )
-        if progress_bar:
-            # Set disable=None, so that it disables on non-TTY
-            progress = tqdm.tqdm(
-                total=eval_frequency, disable=None, leave=False, file=stderr
-            )
-            progress.set_description(f"Epoch {info['epoch']+1}")
+        if console_output:
+            write(
+                msg.row(
+                    data, widths=table_widths, aligns=table_aligns, spacing=spacing
+                )
+            )
+            if progress_bar:
+                # Set disable=None, so that it disables on non-TTY
+                progress = tqdm.tqdm(
+                    total=eval_frequency, disable=None, leave=False, file=stderr
+                )
+                progress.set_description(f"Epoch {info['epoch']+1}")

     def finalize() -> None:
-        pass
+        if output_stream:
+            output_stream.close()

     return log_step, finalize
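
Usage mirrors the new test above: the registered factory returns a setup function, which in turn returns the step and finalize callbacks the training loop drives. A sketch (the file name is illustrative; each step is appended as one JSON line):

```python
import spacy
from spacy.training import loggers

nlp = spacy.blank("en")
nlp.add_pipe("ner")

setup = loggers.console_logger(
    progress_bar=False, console_output=True, output_file="training.jsonl"
)
log_step, finalize = setup(nlp)
log_step({"losses": {"ner": 100.0}, "other_scores": {},
          "epoch": 0, "step": 0, "score": 0.0})
finalize()  # closes the JSONL stream
```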

View File

@@ -398,9 +398,9 @@ def load_model(
     name: Union[str, Path],
     *,
     vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = SimpleFrozenList(),
-    enable: Iterable[str] = SimpleFrozenList(),
-    exclude: Iterable[str] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from a package or data path.
@@ -408,9 +408,9 @@ def load_model(
     name (str): Package name or model path.
     vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
         a new Vocab object will be created.
-    disable (Iterable[str]): Names of pipeline components to disable.
-    enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled.
-    exclude (Iterable[str]): Names of pipeline components to exclude.
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All others will be disabled.
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
     RETURNS (Language): The loaded nlp object.
@@ -440,9 +440,9 @@ def load_model_from_package(
     name: str,
     *,
     vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = SimpleFrozenList(),
-    enable: Iterable[str] = SimpleFrozenList(),
-    exclude: Iterable[str] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from an installed package.
@@ -450,12 +450,12 @@ def load_model_from_package(
     name (str): The package name.
     vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
         a new Vocab object will be created.
-    disable (Iterable[str]): Names of pipeline components to disable. Disabled
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
-    enable (Iterable[str]): Names of pipeline components to enable. All other
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
         pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
-    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
@@ -470,9 +470,9 @@ def load_model_from_path(
     *,
     meta: Optional[Dict[str, Any]] = None,
     vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = SimpleFrozenList(),
-    enable: Iterable[str] = SimpleFrozenList(),
-    exclude: Iterable[str] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from a data directory path. Creates Language class with
@@ -482,12 +482,12 @@ def load_model_from_path(
     meta (Dict[str, Any]): Optional model meta.
     vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
         a new Vocab object will be created.
-    disable (Iterable[str]): Names of pipeline components to disable. Disabled
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
-    enable (Iterable[str]): Names of pipeline components to enable. All other
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
         pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
-    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
@@ -516,9 +516,9 @@ def load_model_from_config(
     *,
     meta: Dict[str, Any] = SimpleFrozenDict(),
     vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = SimpleFrozenList(),
-    enable: Iterable[str] = SimpleFrozenList(),
-    exclude: Iterable[str] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
     auto_fill: bool = False,
     validate: bool = True,
 ) -> "Language":
@@ -529,12 +529,12 @@ def load_model_from_config(
     meta (Dict[str, Any]): Optional model meta.
     vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
         a new Vocab object will be created.
-    disable (Iterable[str]): Names of pipeline components to disable. Disabled
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
-    enable (Iterable[str]): Names of pipeline components to enable. All other
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
         pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
-    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
         components won't be loaded.
     auto_fill (bool): Whether to auto-fill config with missing defaults.
     validate (bool): Whether to show config validation errors.
@@ -616,9 +616,9 @@ def load_model_from_init_py(
     init_file: Union[Path, str],
     *,
     vocab: Union["Vocab", bool] = True,
-    disable: Iterable[str] = SimpleFrozenList(),
-    enable: Iterable[str] = SimpleFrozenList(),
-    exclude: Iterable[str] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
+    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
     """Helper function to use in the `load()` method of a model package's
@@ -626,12 +626,12 @@ def load_model_from_init_py(
     vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
         a new Vocab object will be created.
-    disable (Iterable[str]): Names of pipeline components to disable. Disabled
+    disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
-    enable (Iterable[str]): Names of pipeline components to enable. All other
+    enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
         pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
-    exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
+    exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.

View File

@@ -77,14 +77,15 @@ $ python -m spacy info [--markdown] [--silent] [--exclude]
 $ python -m spacy info [model] [--markdown] [--silent] [--exclude]
 ```

 | Name                                             | Description                                                                                                              |
 | ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------ |
 | `model`                                          | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~                                     |
 | `--markdown`, `-md`                              | Print information as Markdown. ~~bool (flag)~~                                                                            |
 | `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~                                                             |
 | `--exclude`, `-e`                                | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~                             |
+| `--url`, `-u` <Tag variant="new">3.5.0</Tag>     | Print the URL to download the most recent compatible version of the pipeline. Requires a pipeline name. ~~bool (flag)~~   |
 | `--help`, `-h`                                   | Show help message and available arguments. ~~bool (flag)~~                                                                |
 | **PRINTS**                                       | Information about your spaCy installation.                                                                                |

View File

@@ -63,17 +63,18 @@ spaCy loads a model under the hood based on its

> nlp = Language.from_config(config)
> ```

| Name                                  | Description |
| ------------------------------------- | ----------- |
| `config`                              | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
| _keyword-only_                        |             |
| `vocab`                               | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude`                             | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `meta`                                | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
| `auto_fill`                           | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
| `validate`                            | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS**                           | The initialized object. ~~Language~~ |
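To illustrate the widened `disable`/`enable` types, here is a minimal sketch; the config path and component names are placeholders:

```python
import spacy
from spacy.language import Language

config = spacy.util.load_config("config.cfg")
# A single component name is now accepted as well as an iterable of names
nlp = Language.from_config(config, disable="ner")
nlp = Language.from_config(config, enable=["tok2vec", "tagger"])
```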
## Language.component {#component tag="classmethod" new="3"}
@@ -695,8 +696,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:

| Name           | Description |
| -------------- | ----------- |
| _keyword-only_ |             |
| `disable`      | Name(s) of pipeline component(s) to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable`       | Name(s) of pipeline component(s) that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| **RETURNS**    | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
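For example, `enable` also accepts a single component name. A quick sketch, assuming a standard pretrained pipeline:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# Temporarily run only the tagger; all other components are disabled
with nlp.select_pipes(enable="tagger"):
    doc = nlp("Only the tagger runs inside this block.")
# On exiting the block, the disabled pipes are restored automatically
```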
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}


@@ -248,6 +248,59 @@ added to an existing vectors table. See more details in

## Loggers {#loggers}
These functions are available from `@spacy.registry.loggers`.
### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1}
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v1"
> progress_bar = true
> ```
Writes the results of a training step to the console in a tabular format.
<Accordion title="Example console output" spaced>
```cli
$ python -m spacy train config.cfg
```
```
Using CPU
Loading config and nlp from: config.cfg
Pipeline: ['tok2vec', 'tagger']
Start training
Training. Initial learn rate: 0.0
E    #       LOSS TOK2VEC  LOSS TAGGER  TAG_ACC  SCORE
---  ------  ------------  -----------  -------  ------
  0       0          0.00        86.20     0.22    0.00
  0     200          3.08     18968.78    34.00    0.34
  0     400         31.81     22539.06    33.64    0.34
  0     600         92.13     22794.91    43.80    0.44
  0     800        183.62     21541.39    56.05    0.56
  0    1000        352.49     25461.82    65.15    0.65
  0    1200        422.87     23708.82    71.84    0.72
  0    1400        601.92     24994.79    76.57    0.77
  0    1600        662.57     22268.02    80.20    0.80
  0    1800       1101.50     28413.77    82.56    0.83
  0    2000       1253.43     28736.36    85.00    0.85
  0    2200       1411.02     28237.53    87.42    0.87
  0    2400       1605.35     28439.95    88.70    0.89
```
Note that the cumulative loss keeps increasing within one epoch, but should
start decreasing across epochs.
</Accordion>
| Name | Description |
| -------------- | --------------------------------------------------------- |
| `progress_bar` | Whether the logger should print the progress bar. ~~bool~~ |
Logging utilities for spaCy are implemented in the
[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
functions are typically available from `@spacy.registry.loggers`.


@@ -255,9 +255,10 @@ Return a copy of the span group.

> new_group = doc.spans["errors"].copy()
> ```

| Name        | Description |
| ----------- | ----------- |
| `doc`       | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
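A short sketch of the new `doc` argument, assuming the target `Doc` has compatible tokens:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Their goal is to be reviewed.")
doc.spans["errors"] = [doc[0:2]]

# Bind the copy to a second Doc instead of the original
other_doc = nlp("Their goal is to be reviewed.")
copied = doc.spans["errors"].copy(doc=other_doc)
```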
## SpanGroup.to_bytes {#to_bytes tag="method"}


@@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.

> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ```

| Name                                  | Description |
| ------------------------------------- | ----------- |
| `name`                                | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_                        |             |
| `vocab`                               | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
| `exclude` <Tag variant="new">3</Tag>  | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag>   | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS**                           | A `Language` object with the loaded pipeline. ~~Language~~ |
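A minimal sketch of the updated semantics (the component names are illustrative):

```python
import spacy

# A plain string now works anywhere an iterable of names was expected
nlp = spacy.load("en_core_web_sm", disable="parser")

# Enable only the listed components; all others are loaded but disabled
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "ner"])
```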
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
@@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization.

### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}

Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with
the `manual=True` argument in `displacy.render`.

> #### Example
>
@@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`.

### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}

Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For
use with the `manual=True` argument in `displacy.render`.

> #### Example
>
@@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`.

### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}

Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For
use with the `manual=True` argument in `displacy.render`.

> #### Example
>
@@ -451,7 +451,7 @@ factories.

| Registry name     | Description |
| ----------------- | ----------- |
| `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `augmenters`      | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
| `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@@ -505,7 +505,7 @@ finished. To log each training step, a

and the accuracy scores on the development set.

The built-in, default logger is the ConsoleLogger, which prints results to the
console in tabular format and saves them to a `jsonl` file. The
[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
a dependency of spaCy, enables other loggers, such as one that sends results to
a [Weights & Biases](https://www.wandb.com/) dashboard.

@@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.

Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}

> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v2"
> progress_bar = true
> console_output = true
> output_file = "training_log.jsonl"
> ```

Writes the results of a training step to the console in a tabular format and
saves them to a `jsonl` file.

<Accordion title="Example console output" spaced>
@@ -536,22 +540,23 @@ $ python -m spacy train config.cfg

Pipeline: ['tok2vec', 'tagger']
Start training
Training. Initial learn rate: 0.0
Saving results to training_log.jsonl
E    #       LOSS TOK2VEC  LOSS TAGGER  TAG_ACC  SCORE
---  ------  ------------  -----------  -------  ------
  0       0          0.00        86.20     0.22    0.00
  0     200          3.08     18968.78    34.00    0.34
  0     400         31.81     22539.06    33.64    0.34
  0     600         92.13     22794.91    43.80    0.44
  0     800        183.62     21541.39    56.05    0.56
  0    1000        352.49     25461.82    65.15    0.65
  0    1200        422.87     23708.82    71.84    0.72
  0    1400        601.92     24994.79    76.57    0.77
  0    1600        662.57     22268.02    80.20    0.80
  0    1800       1101.50     28413.77    82.56    0.83
  0    2000       1253.43     28736.36    85.00    0.85
  0    2200       1411.02     28237.53    87.42    0.87
  0    2400       1605.35     28439.95    88.70    0.89
```
Note that the cumulative loss keeps increasing within one epoch, but should

@@ -559,6 +564,12 @@ start decreasing across epochs.

</Accordion>
| Name | Description |
| ---------------- | --------------------------------------------------------------------- |
| `progress_bar`   | Whether the logger should print the progress bar. ~~bool~~            |
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
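Since the output file is JSONL (one JSON object per line), the log is easy to post-process. A minimal sketch using `srsly`; the file name matches the example config above:

```python
import srsly

# Each line of the output file is one logged entry
for entry in srsly.read_jsonl("training_log.jsonl"):
    print(entry)
```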
## Readers {#readers}

### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
@@ -1038,15 +1049,16 @@ and create a `Language` object. The model data will then be loaded in via

> nlp = util.load_model("/path/to/data")
> ```

| Name                                  | Description |
| ------------------------------------- | ----------- |
| `name`                                | Package name or path. ~~str~~ |
| _keyword-only_                        |             |
| `vocab`                               | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude`                             | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag>   | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS**                           | `Language` class with the loaded pipeline. ~~Language~~ |
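For instance, a minimal sketch (the path and override value are placeholders):

```python
from spacy import util

nlp = util.load_model(
    "/path/to/data",
    enable="tagger",  # a single name is accepted here too
    config={"nlp.batch_size": 64},
)
```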
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
@@ -1062,15 +1074,16 @@ A helper function to use in the `load()` method of a pipeline package's

> return load_model_from_init_py(__file__, **overrides)
> ```

| Name                                  | Description |
| ------------------------------------- | ----------- |
| `init_file`                           | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| _keyword-only_                        |             |
| `vocab` <Tag variant="new">3</Tag>    | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude` <Tag variant="new">3</Tag>  | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag>   | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS**                           | `Language` class with the loaded pipeline. ~~Language~~ |
### util.load_config {#util.load_config tag="function" new="3"}


@@ -396,15 +396,32 @@ pipeline package can be found.

To download a trained pipeline directly using
[pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local
path of the wheel file or archive. Installing the wheel is usually more
efficient.

> #### Pipeline Package URLs {#pipeline-urls}
>
> Pretrained pipeline distributions are hosted on
> [GitHub Releases](https://github.com/explosion/spacy-models/releases), and you
> can find download links there, as well as on the model page. You can also get
> URLs directly from the command line by using `spacy info` with the `--url`
> flag, which may be useful for automation.
>
> ```bash
> spacy info en_core_web_sm --url
> ```
>
> This command prints the URL for the latest version of a pipeline compatible
> with the version of spaCy you're using. Note that an internet connection is
> required to look up the compatibility information.

```bash
# With external URL
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz

# Using spacy info to get the external URL
$ pip install $(spacy info en_core_web_sm --url)

# With local file
$ pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl
$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
@@ -545,21 +562,16 @@ should be specifying them directly.

Because pipeline packages are valid Python packages, you can add them to your
application's `requirements.txt`. If you're running your own internal PyPI
installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
supports both package names to download via a PyPI server and
[direct URLs](#pipeline-urls).

```text
### requirements.txt
spacy>=3.0.0,<4.0.0
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
```

All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can


@@ -1192,7 +1192,7 @@

"slogan": "Fast, flexible and transparent sentiment analysis",
"description": "Asent is a rule-based sentiment analysis library for Python made using spaCy. It is inspired by VADER, but uses a more modular ruleset that allows the user to change e.g. the method for finding negations. Furthermore, it includes visualisers to visualize the model predictions, making the model easily interpretable.",
"github": "kennethenevoldsen/asent",
"pip": "asent",
"code_example": [
    "import spacy",
    "import asent",


@@ -76,6 +76,7 @@ const MODEL_META = {

    benchmark_ner: 'NER accuracy',
    benchmark_speed: 'Speed',
    compat: 'Latest compatible package version for your spaCy installation',
    download_link: 'Download link for the pipeline',
}

const LABEL_SCHEME_META = {

@@ -138,6 +139,13 @@ function formatAccuracy(data, lang) {

        .filter(item => item)
}
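
// Build the direct download URL for a pipeline wheel hosted on GitHub Releases,
// e.g. .../releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl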
function formatDownloadLink(lang, name, version) {
    const fullName = `${lang}_${name}-${version}`
    const filename = `${fullName}-py3-none-any.whl`
    const url = `https://github.com/explosion/spacy-models/releases/download/${fullName}/${filename}`
    return <Link to={url} hideIcon>{filename}</Link>
}
function formatModelMeta(data) {
    return {
        fullName: `${data.lang}_${data.name}-${data.version}`,

@@ -154,6 +162,7 @@ function formatModelMeta(data) {

        labels: isEmptyObj(data.labels) ? null : data.labels,
        vectors: formatVectors(data.vectors),
        accuracy: formatAccuracy(data.performance, data.lang),
        download_link: formatDownloadLink(data.lang, data.name, data.version),
    }
}
@@ -244,6 +253,7 @@ const Model = ({

    { label: 'Components', content: components, help: MODEL_META.components },
    { label: 'Pipeline', content: pipeline, help: MODEL_META.pipeline },
    { label: 'Vectors', content: meta.vectors, help: MODEL_META.vecs },
    { label: 'Download Link', content: meta.download_link, help: MODEL_META.download_link },
    { label: 'Sources', content: sources, help: MODEL_META.sources },
    { label: 'Author', content: author },
    { label: 'License', content: license },