Merge pull request #11448 from shadeMe/merge-develop-into-v4

Merge `develop` into `v4`
This commit is contained in:
Madeesh Kannan 2022-09-07 13:26:11 +02:00 committed by GitHub
commit 60c050e82b
38 changed files with 752 additions and 208 deletions

View File

@ -6,7 +6,6 @@ requires = [
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.1.0,<8.2.0",
"pathy",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,5 +1,5 @@
# Our libraries
spacy-legacy>=3.0.9,<3.1.0
spacy-legacy>=3.0.10,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
@ -34,4 +34,5 @@ mypy>=0.910,<0.970; platform_machine!='aarch64'
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0

View File

@ -33,7 +33,7 @@ include_package_data = true
python_requires = >=3.6
install_requires =
# Our libraries
spacy-legacy>=3.0.9,<3.1.0
spacy-legacy>=3.0.10,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
@ -42,9 +42,9 @@ install_requires =
wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.5.0
pathy>=0.3.5
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0

View File

@ -31,21 +31,21 @@ def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(),
enable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
disable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
enable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
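
With `disable`/`enable`/`exclude` now accepting a bare string, loading looks like this (a minimal sketch, assuming the `en_core_web_sm` pipeline is installed):

```python
import spacy

# A single component name now works the same way as a one-element list.
nlp = spacy.load("en_core_web_sm", disable="ner")
assert "ner" in nlp.component_names   # still loaded ...
assert "ner" not in nlp.pipe_names    # ... but not run
```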

View File

@ -20,7 +20,7 @@ def download_cli(
ctx: typer.Context,
model: str = Arg(..., help="Name of pipeline package to download"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
# fmt: on
):
"""
@ -36,7 +36,12 @@ def download_cli(
download(model, direct, sdist, *ctx.args)
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
def download(
model: str,
direct: bool = False,
sdist: bool = False,
*pip_args,
) -> None:
if (
not (is_package("spacy") or is_package("spacy-nightly"))
and "--no-deps" not in pip_args
@ -50,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
if direct:
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
else:
model_name = model
if model in OLD_MODEL_SHORTCUTS:
@ -67,13 +69,26 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
model_name = OLD_MODEL_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args)
msg.good(
"Download and installation successful",
f"You can now load the package via spacy.load('{model_name}')",
)
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename
def get_compatibility() -> dict:
if is_prerelease_version(about.__version__):
version: Optional[str] = about.__version__
@ -105,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
return comp[model][0]
def get_latest_version(model: str) -> str:
comp = get_compatibility()
return get_version(model, comp)
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
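
A standalone sketch of what `get_model_filename` builds; the suffix strings below are illustrative assumptions, not the actual `WHEEL_SUFFIX`/`SDIST_SUFFIX` constants:

```python
# Mirrors the filename logic above: wheel downloads drop the #egg fragment,
# sdist downloads keep it.
def model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    suffix = ".tar.gz" if sdist else "-py3-none-any.whl"  # assumed values
    filename = f"{model_name}-{version}/{model_name}-{version}{suffix}"
    if sdist:
        filename += f"#egg={model_name}=={version}"
    return filename

print(model_filename("en_core_web_sm", "3.4.0"))
# en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
```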

View File

@ -1,10 +1,13 @@
from typing import Optional, Dict, Any, Union, List
import platform
import pkg_resources
import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version
from .. import util
from .. import about
@ -16,6 +19,7 @@ def info_cli(
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
# fmt: on
):
"""
@ -23,10 +27,19 @@ def info_cli(
print its meta information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
Flag --url prints only the download URL of the most recent compatible
version of the pipeline.
DOCS: https://spacy.io/api/cli#info
"""
exclude = string_to_list(exclude)
info(model, markdown=markdown, silent=silent, exclude=exclude)
info(
model,
markdown=markdown,
silent=silent,
exclude=exclude,
url=url,
)
def info(
@ -35,11 +48,20 @@ def info(
markdown: bool = False,
silent: bool = True,
exclude: Optional[List[str]] = None,
url: bool = False,
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if not exclude:
exclude = []
if model:
if url:
if model is not None:
title = f"Download info for pipeline '{model}'"
data = info_model_url(model)
print(data["download_url"])
return data
else:
msg.fail("--url option requires a pipeline name", exits=1)
elif model:
title = f"Info about pipeline '{model}'"
data = info_model(model, silent=silent)
else:
@ -99,11 +121,43 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
download_url = info_installed_model_url(model)
if download_url:
meta["download_url"] = download_url
return {
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
}
def info_installed_model_url(model: str) -> Optional[str]:
"""Given a pipeline name, get the download URL if available, otherwise
return None.
This is only available for pipelines installed as modules that have
dist-info available.
"""
try:
dist = pkg_resources.get_distribution(model)
data = json.loads(dist.get_metadata("direct_url.json"))
return data["url"]
except pkg_resources.DistributionNotFound:
# no such package
return None
except Exception:
# something else, like no file or invalid JSON
return None
def info_model_url(model: str) -> Dict[str, Any]:
"""Return the download URL for the latest version of a pipeline."""
version = get_latest_version(model)
filename = get_model_filename(model, version)
download_url = about.__download_url__ + "/" + filename
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
release_url = release_tpl.format(m=model, v=version)
return {"download_url": download_url, "release_url": release_url}
def get_markdown(
data: Dict[str, Any],
title: Optional[str] = None,
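
A hedged sketch of exercising the new `--url` behaviour from Python (assumes network access and that `en_core_web_sm` has a compatible release):

```python
from spacy.cli.info import info

# Prints the download URL of the most recent compatible wheel and returns a
# dict with "download_url" and "release_url".
data = info("en_core_web_sm", url=True)
print(data["release_url"])
```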

View File

@ -230,8 +230,9 @@ class Errors(metaclass=ErrorsWithCodes):
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
"exists. Existing factory: {func}. New factory: {new_func}")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
"Doc. If you're using a custom component, maybe you forgot to "
"return the processed Doc?")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "

View File

@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, and fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
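
A toy illustration of the new lookup fallback; the table below is made up, the real one comes from `spacy-lookups-data`:

```python
# Use the lookup table if the surface form is present, otherwise fall back to
# the token text itself.
lookup_table = {"gossos": ["gos"]}  # illustrative toy table
for string in ("gossos", "gatets"):
    print(lookup_table.get(string, [string])[0])
# gos     <- found in the table
# gatets  <- falls back to the token text
```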

View File

@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
# first try lookup in table based on upos
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
# then add anything in the exceptions table
forms.extend(exceptions.get(string, []))
# if nothing found yet, use the rules
oov_forms = []
if not forms:
for old, new in rules:
@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(form)
else:
oov_forms.append(form)
# if still nothing, add the oov forms from rules
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, which fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms

spacy/lang/la/__init__.py (new file, 18 lines)
View File

@ -0,0 +1,18 @@
from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Latin(Language):
lang = "la"
Defaults = LatinDefaults
__all__ = ["Latin"]
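
A quick sanity check of the new language data (a sketch; these defaults only provide tokenizer exceptions, stop words and lexical attributes):

```python
import spacy

nlp = spacy.blank("la")
doc = nlp("scio te omnia facturum, ut nobiscum quam primum sis")
print([t.text for t in doc])  # "nobiscum" is split into "nobis" + "cum"
print(doc[6].text)            # nobis
```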

View File

@ -0,0 +1,34 @@
from ...attrs import LIKE_NUM
import re
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
_num_words = set(
"""
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
""".split()
)
_ordinal_words = set(
"""
primus prima primum secundus secunda secundum tertius tertia tertium
""".split()
)
def like_num(text):
if text.isdigit():
return True
if roman_numerals_compile.match(text):
return True
if text.lower() in _num_words:
return True
if text.lower() in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,37 @@
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
STOP_WORDS = set(
"""
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
cum cur
de deinde dum
ego enim ergo es est et etiam etsi ex
fio
haud hic
iam idem igitur ille in infra inter interim ipse is ita
magis modo mox
nam ne nec necque neque nisi non nos
o ob
per possum post pro
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
sed si sic sive sub sui sum super suus
tam tamen trans tu tum
ubi uel uero
vel vero
""".split()
)

View File

@ -0,0 +1,76 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
## TODO: Look into systematically handling u/v
_exc = {
"mecum": [{ORTH: "me"}, {ORTH: "cum"}],
"tecum": [{ORTH: "te"}, {ORTH: "cum"}],
"nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
"vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
for orth in [
"A.",
"Agr.",
"Ap.",
"C.",
"Cn.",
"D.",
"F.",
"K.",
"L.",
"M'.",
"M.",
"Mam.",
"N.",
"Oct.",
"Opet.",
"P.",
"Paul.",
"Post.",
"Pro.",
"Q.",
"S.",
"Ser.",
"Sert.",
"Sex.",
"St.",
"Sta.",
"T.",
"Ti.",
"V.",
"Vol.",
"Vop.",
"U.",
"Uol.",
"Uop.",
"Ian.",
"Febr.",
"Mart.",
"Apr.",
"Mai.",
"Iun.",
"Iul.",
"Aug.",
"Sept.",
"Oct.",
"Nov.",
"Nou.",
"Dec.",
"Non.",
"Id.",
"A.D.",
"Coll.",
"Cos.",
"Ord.",
"Pl.",
"S.C.",
"Suff.",
"Trib.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1028,8 +1028,8 @@ class Language:
raise ValueError(Errors.E109.format(name=name)) from e
except Exception as e:
error_handler(name, proc, [doc], e)
if doc is None:
raise ValueError(Errors.E005.format(name=name))
if not isinstance(doc, Doc):
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
return doc
def disable_pipes(self, *names) -> "DisabledPipes":
@ -1063,7 +1063,7 @@ class Language:
"""
if enable is None and disable is None:
raise ValueError(Errors.E991)
if disable is not None and isinstance(disable, str):
if isinstance(disable, str):
disable = [disable]
if enable is not None:
if isinstance(enable, str):
@ -1698,9 +1698,9 @@ class Language:
config: Union[Dict[str, Any], Config] = {},
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True,
validate: bool = True,
@ -1711,12 +1711,12 @@ class Language:
config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude.
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
Excluded components won't be loaded.
meta (Dict[str, Any]): Meta overrides for nlp.meta.
auto_fill (bool): Automatically fill in missing values in config based
@ -1727,6 +1727,12 @@ class Language:
DOCS: https://spacy.io/api/language#from_config
"""
if isinstance(disable, str):
disable = [disable]
if isinstance(enable, str):
enable = [enable]
if isinstance(exclude, str):
exclude = [exclude]
if auto_fill:
config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
@ -2031,25 +2037,29 @@ class Language:
@staticmethod
def _resolve_component_status(
disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str]
disable: Union[str, Iterable[str]],
enable: Union[str, Iterable[str]],
pipe_names: Iterable[str],
) -> Tuple[str, ...]:
"""Derives whether (1) `disable` and `enable` values are consistent and (2)
resolves those to a single set of disabled components. Raises an error in
case of inconsistency.
disable (Iterable[str]): Names of components or serialization fields to disable.
enable (Iterable[str]): Names of pipeline components to enable.
disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable.
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable.
pipe_names (Iterable[str]): Names of all pipeline components.
RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t.
specified includes and excludes.
"""
if disable is not None and isinstance(disable, str):
if isinstance(disable, str):
disable = [disable]
to_disable = disable
if enable:
if isinstance(enable, str):
enable = [enable]
to_disable = [
pipe_name for pipe_name in pipe_names if pipe_name not in enable
]
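
A small sketch of how an `enable` value is resolved against the pipeline names (inputs are made up):

```python
# A bare string is now accepted and normalized to a list before resolution.
pipe_names = ["tok2vec", "tagger", "parser", "ner"]
enable = "tagger"
if isinstance(enable, str):
    enable = [enable]
to_disable = [name for name in pipe_names if name not in enable]
print(to_disable)  # ['tok2vec', 'parser', 'ner']
```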

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, cython: profile=True
from typing import List
from typing import List, Iterable
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int8_t
@ -868,20 +868,27 @@ class _SetPredicate:
def __call__(self, Token token):
if self.is_extension:
value = get_string_id(token._.get(self.attr))
value = token._.get(self.attr)
else:
value = get_token_attr_for_matcher(token.c, self.attr)
if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
if self.predicate in ("IN", "NOT_IN"):
if isinstance(value, (str, int)):
value = get_string_id(value)
else:
return False
elif self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
# ensure that all values are enclosed in a set
if self.attr == MORPH:
# break up MORPH into individual Feat=Val values
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
elif isinstance(value, (str, int)):
value = set((get_string_id(value),))
elif isinstance(value, Iterable) and all(isinstance(v, (str, int)) for v in value):
value = set(get_string_id(v) for v in value)
else:
# treat a single value as a list
if isinstance(value, (str, int)):
value = set([get_string_id(value)])
else:
value = set(get_string_id(v) for v in value)
return False
if self.predicate == "IN":
return value in self.value
elif self.predicate == "NOT_IN":

View File

@ -256,6 +256,11 @@ def ko_tokenizer_tokenizer():
return nlp.tokenizer
@pytest.fixture(scope="module")
def la_tokenizer():
return get_lang_class("la")().tokenizer
@pytest.fixture(scope="session")
def ko_tokenizer_natto():
pytest.importorskip("natto")

View File

@ -0,0 +1,8 @@
import pytest
def test_la_tokenizer_handles_exc_in_text(la_tokenizer):
text = "scio te omnia facturum, ut nobiscum quam primum sis"
tokens = la_tokenizer(text)
assert len(tokens) == 11
assert tokens[6].text == "nobis"

View File

@ -0,0 +1,35 @@
import pytest
from spacy.lang.la.lex_attrs import like_num
@pytest.mark.parametrize(
"text,match",
[
("IIII", True),
("VI", True),
("vi", True),
("IV", True),
("iv", True),
("IX", True),
("ix", True),
("MMXXII", True),
("0", True),
("1", True),
("quattuor", True),
("decem", True),
("tertius", True),
("canis", False),
("MMXX11", False),
(",", False),
],
)
def test_lex_attrs_like_number(la_tokenizer, text, match):
tokens = la_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match
@pytest.mark.parametrize("word", ["quinque"])
def test_la_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -368,6 +368,16 @@ def test_matcher_intersect_value_operator(en_vocab):
doc[0]._.ext = ["A", "B"]
assert len(matcher(doc)) == 1
# INTERSECTS matches nothing for iterables that aren't all str or int
matcher = Matcher(en_vocab)
pattern = [{"_": {"ext": {"INTERSECTS": ["Abx", "C"]}}}]
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
doc[0]._.ext = [["Abx"], "B"]
assert len(matcher(doc)) == 0
doc[0]._.ext = ["Abx", "B"]
assert len(matcher(doc)) == 1
# INTERSECTS with an empty pattern list matches nothing
matcher = Matcher(en_vocab)
pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
@ -476,14 +486,22 @@ def test_matcher_extension_set_membership(en_vocab):
assert len(matches) == 0
@pytest.mark.xfail(reason="IN predicate must handle sequence values in extensions")
def test_matcher_extension_in_set_predicate(en_vocab):
matcher = Matcher(en_vocab)
Token.set_extension("ext", default=[])
pattern = [{"_": {"ext": {"IN": ["A", "C"]}}}]
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
# The IN predicate expects an exact match between the
# extension value and one of the pattern's values.
doc[0]._.ext = ["A", "B"]
assert len(matcher(doc)) == 0
doc[0]._.ext = ["A"]
assert len(matcher(doc)) == 0
doc[0]._.ext = "A"
assert len(matcher(doc)) == 1

View File

@ -17,6 +17,7 @@ def test_build_dependencies():
"types-dataclasses",
"types-mock",
"types-requests",
"types-setuptools",
]
# ignore language-specific packages that shouldn't be installed by all
libs_ignore_setup = [

View File

@ -618,6 +618,7 @@ def test_load_disable_enable() -> None:
base_nlp.to_disk(tmp_dir)
to_disable = ["parser", "tagger"]
to_enable = ["tagger", "parser"]
single_str = "tagger"
# Setting only `disable`.
nlp = spacy.load(tmp_dir, disable=to_disable)
@ -632,6 +633,16 @@ def test_load_disable_enable() -> None:
]
)
# Loading with a string representing one component
nlp = spacy.load(tmp_dir, exclude=single_str)
assert single_str not in nlp.component_names
nlp = spacy.load(tmp_dir, disable=single_str)
assert single_str in nlp.component_names
assert single_str not in nlp.pipe_names
assert nlp._disabled == {single_str}
assert nlp.disabled == [single_str]
# Testing consistent enable/disable combination.
nlp = spacy.load(
tmp_dir,

View File

@ -670,3 +670,25 @@ def test_dot_in_factory_names(nlp):
with pytest.raises(ValueError, match="not permitted"):
Language.factory("my.evil.component.v1", func=evil_component)
def test_component_return():
"""Test that an error is raised if components return a type other than a
doc."""
nlp = English()
@Language.component("test_component_good_pipe")
def good_pipe(doc):
return doc
nlp.add_pipe("test_component_good_pipe")
nlp("text")
nlp.remove_pipe("test_component_good_pipe")
@Language.component("test_component_bad_pipe")
def bad_pipe(doc):
return doc.text
nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text")

View File

@ -10,7 +10,8 @@ from spacy.ml._precomputable_affine import _backprop_precomputable_affine_paddin
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int
from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import set_current_ops
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@ -18,7 +19,6 @@ from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema
from pydantic import ValidationError
from thinc.api import get_current_ops, NumpyOps, CupyOps
from .util import get_random_doc, make_tempdir
@ -111,26 +111,25 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
def test_prefer_gpu():
current_ops = get_current_ops()
try:
import cupy # noqa: F401
prefer_gpu()
if has_cupy_gpu:
assert prefer_gpu()
assert isinstance(get_current_ops(), CupyOps)
except ImportError:
elif has_torch_mps_gpu:
assert prefer_gpu()
assert isinstance(get_current_ops(), MPSOps)
else:
assert not prefer_gpu()
set_current_ops(current_ops)
def test_require_gpu():
current_ops = get_current_ops()
try:
import cupy # noqa: F401
if has_cupy_gpu:
require_gpu()
assert isinstance(get_current_ops(), CupyOps)
except ImportError:
with pytest.raises(ValueError):
require_gpu()
elif has_torch_mps_gpu:
require_gpu()
assert isinstance(get_current_ops(), MPSOps)
set_current_ops(current_ops)

View File

@ -0,0 +1,30 @@
import pytest
import spacy
from spacy.training import loggers
@pytest.fixture()
def nlp():
nlp = spacy.blank("en")
nlp.add_pipe("ner")
return nlp
@pytest.fixture()
def info():
return {
"losses": {"ner": 100},
"other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80},
"epoch": 100,
"step": 125,
"score": 85,
}
def test_console_logger(nlp, info):
console_logger = loggers.console_logger(
progress_bar=True, console_output=True, output_file=None
)
log_step, finalize = console_logger(nlp)
log_step(info)

View File

@ -1,4 +1,4 @@
from typing import Any, Dict, Iterable
from typing import Any, Dict, Iterable, Optional
from .doc import Doc
from .span import Span
@ -24,4 +24,4 @@ class SpanGroup:
def __getitem__(self, i: int) -> Span: ...
def to_bytes(self) -> bytes: ...
def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
def copy(self) -> SpanGroup: ...
def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...

View File

@ -244,15 +244,18 @@ cdef class SpanGroup:
cdef void push_back(self, const shared_ptr[SpanC] &span):
self.c.push_back(span)
def copy(self) -> SpanGroup:
def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
"""Clones the span group.
doc (Doc): New reference document to which the copy is bound.
RETURNS (SpanGroup): A copy of the span group.
DOCS: https://spacy.io/api/spangroup#copy
"""
if doc is None:
doc = self.doc
return SpanGroup(
self.doc,
doc,
name=self.name,
attrs=deepcopy(self.attrs),
spans=list(self),
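
A rough sketch of the new `doc` argument, rebinding a copied span group to another `Doc` with identical tokens (both docs below are made up):

```python
from spacy.tokens import Doc, SpanGroup
from spacy.vocab import Vocab

vocab = Vocab()
doc_a = Doc(vocab, words=["Berlin", "is", "nice"])
doc_b = Doc(vocab, words=["Berlin", "is", "nice"])
group = SpanGroup(doc_a, name="cities", spans=[doc_a[0:1]])

# The copy is bound to doc_b instead of the original doc_a.
copied = group.copy(doc=doc_b)
print(copied.doc is doc_b)  # True
```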

View File

@ -42,7 +42,8 @@ class SpanGroups(UserDict):
def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
if doc is None:
doc = self._ensure_doc()
return SpanGroups(doc).from_bytes(self.to_bytes())
data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
return SpanGroups(doc, items=data_copy)
def setdefault(self, key, default=None):
if not isinstance(default, SpanGroup):

View File

@ -1,10 +1,13 @@
from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union
from wasabi import Printer
from pathlib import Path
import tqdm
import sys
import srsly
from ..util import registry
from ..errors import Errors
from .. import util
if TYPE_CHECKING:
from ..language import Language # noqa: F401
@ -23,13 +26,44 @@ def setup_table(
return final_cols, final_widths, ["r" for _ in final_widths]
@registry.loggers("spacy.ConsoleLogger.v1")
def console_logger(progress_bar: bool = False):
@registry.loggers("spacy.ConsoleLogger.v2")
def console_logger(
progress_bar: bool = False,
console_output: bool = True,
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
progress_bar (bool): Whether the logger should print the progress bar.
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
_log_exist = False
if output_file:
output_file = util.ensure_path(output_file) # type: ignore
if output_file.exists(): # type: ignore
_log_exist = True
if not output_file.parents[0].exists(): # type: ignore
output_file.parents[0].mkdir(parents=True) # type: ignore
def setup_printer(
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
write = lambda text: print(text, file=stdout, flush=True)
msg = Printer(no_print=True)
nonlocal output_file
output_stream = None
if _log_exist:
write(
msg.warn(
f"Saving logs is disabled because {output_file} already exists."
)
)
output_file = None
elif output_file:
write(msg.info(f"Saving results to {output_file}"))
output_stream = open(output_file, "w", encoding="utf-8")
# ensure that only trainable components are logged
logged_pipes = [
name
@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False):
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
spacing = 2
table_header, table_widths, table_aligns = setup_table(
cols=["E", "#"] + loss_cols + score_cols + ["Score"],
widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
)
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
if console_output:
spacing = 2
table_header, table_widths, table_aligns = setup_table(
cols=["E", "#"] + loss_cols + score_cols + ["Score"],
widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
)
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
progress = None
def log_step(info: Optional[Dict[str, Any]]) -> None:
@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False):
if progress is not None:
progress.update(1)
return
losses = [
"{0:.2f}".format(float(info["losses"][pipe_name]))
for pipe_name in logged_pipes
]
losses = []
log_losses = {}
for pipe_name in logged_pipes:
losses.append("{0:.2f}".format(float(info["losses"][pipe_name])))
log_losses[pipe_name] = float(info["losses"][pipe_name])
scores = []
log_scores = {}
for col in score_cols:
score = info["other_scores"].get(col, 0.0)
try:
@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False):
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
log_scores[str(col)] = score
data = (
[info["epoch"], info["step"]]
@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False):
+ scores
+ ["{0:.2f}".format(float(info["score"]))]
)
if output_stream:
# Write to log file per log_step
log_data = {
"epoch": info["epoch"],
"step": info["step"],
"losses": log_losses,
"scores": log_scores,
"score": float(info["score"]),
}
output_stream.write(srsly.json_dumps(log_data) + "\n")
if progress is not None:
progress.close()
write(
msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
)
if progress_bar:
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
total=eval_frequency, disable=None, leave=False, file=stderr
if console_output:
write(
msg.row(
data, widths=table_widths, aligns=table_aligns, spacing=spacing
)
)
progress.set_description(f"Epoch {info['epoch']+1}")
if progress_bar:
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
total=eval_frequency, disable=None, leave=False, file=stderr
)
progress.set_description(f"Epoch {info['epoch']+1}")
def finalize() -> None:
pass
if output_stream:
output_stream.close()
return log_step, finalize
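
For reference, the shape of one line written to the jsonl log per step; the numbers are illustrative (taken from the example output further down) and `tag_acc` assumes a tagger pipeline:

```python
# Keys match the log_data dict above; one such line is appended per log_step.
log_line = {
    "epoch": 0,
    "step": 200,
    "losses": {"tok2vec": 3.08, "tagger": 18968.78},
    "scores": {"tag_acc": 34.0},
    "score": 0.34,
}
```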

View File

@ -398,9 +398,9 @@ def load_model(
name: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
@ -408,9 +408,9 @@ def load_model(
name (str): Package name or model path.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled.
exclude (Iterable[str]): Names of pipeline components to exclude.
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All others will be disabled.
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
@ -440,9 +440,9 @@ def load_model_from_package(
name: str,
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.
@ -450,12 +450,12 @@ def load_model_from_package(
name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
@ -470,9 +470,9 @@ def load_model_from_path(
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
@ -482,12 +482,12 @@ def load_model_from_path(
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
@ -516,9 +516,9 @@ def load_model_from_config(
*,
meta: Dict[str, Any] = SimpleFrozenDict(),
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
auto_fill: bool = False,
validate: bool = True,
) -> "Language":
@ -529,12 +529,12 @@ def load_model_from_config(
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
@ -616,9 +616,9 @@ def load_model_from_init_py(
init_file: Union[Path, str],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
enable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
@ -626,12 +626,12 @@ def load_model_from_init_py(
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
enable (Iterable[str]): Names of pipeline components to enable. All other
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.

View File

@ -77,14 +77,15 @@ $ python -m spacy info [--markdown] [--silent] [--exclude]
$ python -m spacy info [model] [--markdown] [--silent] [--exclude]
```
| Name | Description |
| ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Information about your spaCy installation. |
| Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
| `--url`, `-u` <Tag variant="new">3.5.0</Tag> | Print the URL to download the most recent compatible version of the pipeline. Requires a pipeline name. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Information about your spaCy installation. |
## validate {#validate new="2" tag="command"}

View File

@ -63,17 +63,18 @@ spaCy loads a model under the hood based on its
> nlp = Language.from_config(config)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
| _keyword-only_ | |
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The initialized object. ~~Language~~ |
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
| _keyword-only_ | |
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The initialized object. ~~Language~~ |
## Language.component {#component tag="classmethod" new="3"}
@ -695,8 +696,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------ |
| _keyword-only_ | |
| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| `disable` | Name(s) of pipeline component(s) to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable` | Name(s) of pipeline component(s) that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}

View File

@ -248,6 +248,59 @@ added to an existing vectors table. See more details in
## Loggers {#loggers}
These functions are available from `@spacy.registry.loggers`.
### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1}
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v1"
> progress_bar = true
> ```
Writes the results of a training step to the console in a tabular format.
<Accordion title="Example console output" spaced>
```cli
$ python -m spacy train config.cfg
```
```
Using CPU
Loading config and nlp from: config.cfg
Pipeline: ['tok2vec', 'tagger']
Start training
Training. Initial learn rate: 0.0
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
--- ------ ------------ ----------- ------- ------
0 0 0.00 86.20 0.22 0.00
0 200 3.08 18968.78 34.00 0.34
0 400 31.81 22539.06 33.64 0.34
0 600 92.13 22794.91 43.80 0.44
0 800 183.62 21541.39 56.05 0.56
0 1000 352.49 25461.82 65.15 0.65
0 1200 422.87 23708.82 71.84 0.72
0 1400 601.92 24994.79 76.57 0.77
0 1600 662.57 22268.02 80.20 0.80
0 1800 1101.50 28413.77 82.56 0.83
0 2000 1253.43 28736.36 85.00 0.85
0 2200 1411.02 28237.53 87.42 0.87
0 2400 1605.35 28439.95 88.70 0.89
```
Note that the cumulative loss keeps increasing within one epoch, but should
start decreasing across epochs.
</Accordion>
| Name | Description |
| -------------- | --------------------------------------------------------- |
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
Logging utilities for spaCy are implemented in the
[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
functions are typically available from `@spacy.registry.loggers`.

View File

@ -255,9 +255,10 @@ Return a copy of the span group.
> new_group = doc.spans["errors"].copy()
> ```
| Name | Description |
| ----------- | ----------------------------------------------- |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------- |
| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
## SpanGroup.to_bytes {#to_bytes tag="method"}

View File

@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ```
| Name | Description |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `enable` | Names of pipeline components to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
| Name | Description |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization.
### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
Generate dependency parse in `{'words': [], 'arcs': []}` format.
For use with the `manual=True` argument in `displacy.render`.
Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with
the `manual=True` argument in `displacy.render`.
> #### Example
>
@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`.
### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
For use with the `manual=True` argument in `displacy.render`.
Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For
use with the `manual=True` argument in `displacy.render`.
> #### Example
>
@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`.
### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
For use with the `manual=True` argument in `displacy.render`.
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For
use with the `manual=True` argument in `displacy.render`.
> #### Example
>
@ -451,7 +451,7 @@ factories.
| Registry name | Description |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@ -505,7 +505,7 @@ finished. To log each training step, a
and the accuracy scores on the development set.
The built-in, default logger is the ConsoleLogger, which prints results to the
console in tabular format. The
console in tabular format and saves them to a `jsonl` file. The
[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
a dependency of spaCy, enables other loggers, such as one that sends results to
a [Weights & Biases](https://www.wandb.com/) dashboard.
@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v1"
> @loggers = "spacy.ConsoleLogger.v2"
> progress_bar = true
> console_output = true
> output_file = "training_log.jsonl"
> ```
Writes the results of a training step to the console in a tabular format.
Writes the results of a training step to the console in a tabular format and
saves them to a `jsonl` file.
<Accordion title="Example console output" spaced>
@ -536,22 +540,23 @@ $ python -m spacy train config.cfg
Pipeline: ['tok2vec', 'tagger']
Start training
Training. Initial learn rate: 0.0
Saving results to training_log.jsonl
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
--- ------ ------------ ----------- ------- ------
1 0 0.00 86.20 0.22 0.00
1 200 3.08 18968.78 34.00 0.34
1 400 31.81 22539.06 33.64 0.34
1 600 92.13 22794.91 43.80 0.44
1 800 183.62 21541.39 56.05 0.56
1 1000 352.49 25461.82 65.15 0.65
1 1200 422.87 23708.82 71.84 0.72
1 1400 601.92 24994.79 76.57 0.77
1 1600 662.57 22268.02 80.20 0.80
1 1800 1101.50 28413.77 82.56 0.83
1 2000 1253.43 28736.36 85.00 0.85
1 2200 1411.02 28237.53 87.42 0.87
1 2400 1605.35 28439.95 88.70 0.89
0 0 0.00 86.20 0.22 0.00
0 200 3.08 18968.78 34.00 0.34
0 400 31.81 22539.06 33.64 0.34
0 600 92.13 22794.91 43.80 0.44
0 800 183.62 21541.39 56.05 0.56
0 1000 352.49 25461.82 65.15 0.65
0 1200 422.87 23708.82 71.84 0.72
0 1400 601.92 24994.79 76.57 0.77
0 1600 662.57 22268.02 80.20 0.80
0 1800 1101.50 28413.77 82.56 0.83
0 2000 1253.43 28736.36 85.00 0.85
0 2200 1411.02 28237.53 87.42 0.87
0 2400 1605.35 28439.95 88.70 0.89
```
Note that the cumulative loss keeps increasing within one epoch, but should
@ -559,6 +564,12 @@ start decreasing across epochs.
</Accordion>
| Name | Description |
| ---------------- | --------------------------------------------------------------------- |
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
## Readers {#readers}
### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
@ -1038,15 +1049,16 @@ and create a `Language` object. The model data will then be loaded in via
> nlp = util.load_model("/path/to/data")
> ```
| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | Package name or path. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Package name or path. ~~str~~ |
| _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
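
For illustration, a minimal sketch of the single-string form introduced in v3.4 (this assumes the `en_core_web_sm` package is installed):

```python
import spacy

# A single component name can now be passed as a plain string ...
nlp = spacy.load("en_core_web_sm", disable="parser")
# ... which is equivalent to the iterable form:
nlp = spacy.load("en_core_web_sm", disable=["parser"])

# enable keeps only the listed components running; everything else is
# disabled but stays loaded and can be re-enabled with nlp.enable_pipe.
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])
```
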
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
@@ -1062,15 +1074,16 @@ A helper function to use in the `load()` method of a pipeline package's
> return load_model_from_init_py(__file__, **overrides)
> ```
| Name | Description |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
| Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
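
As a usage sketch (again assuming an installed `en_core_web_sm` package whose `load()` forwards its keyword arguments, as in the example above), the same string-or-iterable arguments can be passed straight through:

```python
import en_core_web_sm

# The package's load() calls load_model_from_init_py under the hood, so
# disable/enable/exclude accept a plain string or an iterable of names here too.
nlp = en_core_web_sm.load(exclude="ner")
```
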
### util.load_config {#util.load_config tag="function" new="3"}

View File

@@ -396,15 +396,32 @@ pipeline package can be found.
To download a trained pipeline directly using
[pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local
path of the wheel file or archive. Installing the wheel is usually more
efficient. To find the direct link to a package, head over to the
[releases](https://github.com/explosion/spacy-models/releases), right click on
the archive link and copy it to your clipboard.
efficient.
> #### Pipeline Package URLs {#pipeline-urls}
>
> Pretrained pipeline distributions are hosted on
> [GitHub Releases](https://github.com/explosion/spacy-models/releases), and you
> can find download links there, as well as on the model page. You can also get
> URLs directly from the command line by using `spacy info` with the `--url`
> flag, which may be useful for automation.
>
> ```bash
> spacy info en_core_web_sm --url
> ```
>
> This command will print the URL for the latest version of a pipeline
> compatible with the version of spaCy you're using. Note that an internet
> connection is required to look up the compatibility information.
```bash
# With external URL
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
# Using spacy info to get the external URL
$ pip install $(spacy info en_core_web_sm --url)
# With local file
$ pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl
$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
@@ -545,21 +562,16 @@ should be specifying them directly.
Because pipeline packages are valid Python packages, you can add them to your
application's `requirements.txt`. If you're running your own internal PyPI
installation, you can upload the pipeline packages there. pip's
[requirements file format](https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format)
supports both package names to download via a PyPi server, as well as direct
URLs.
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
supports both package names to download via a PyPI server and
[direct URLs](#pipeline-urls).
```text
### requirements.txt
spacy>=3.0.0,<4.0.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
```
Specifying `#egg=` with the package name tells pip which package to expect from
the download URL. This way, the package won't be re-downloaded and overwritten
if it's already installed - just like when you're downloading a package from
PyPi.
All pipeline packages are versioned and specify their spaCy dependency. This
ensures cross-compatibility and lets you specify exact version requirements for
each pipeline. If you've [trained](/usage/training) your own pipeline, you can

View File

@@ -1192,7 +1192,7 @@
"slogan": "Fast, flexible and transparent sentiment analysis",
"description": "Asent is a rule-based sentiment analysis library for Python made using spaCy. It is inspired by VADER, but uses a more modular ruleset, that allows the user to change e.g. the method for finding negations. Furthermore it includes visualisers to visualize the model predictions, making the model easily interpretable.",
"github": "kennethenevoldsen/asent",
"pip": "aseny",
"pip": "asent",
"code_example": [
"import spacy",
"import asent",

View File

@@ -76,6 +76,7 @@ const MODEL_META = {
benchmark_ner: 'NER accuracy',
benchmark_speed: 'Speed',
compat: 'Latest compatible package version for your spaCy installation',
download_link: 'Download link for the pipeline',
}
const LABEL_SCHEME_META = {
@@ -138,6 +139,13 @@ function formatAccuracy(data, lang) {
.filter(item => item)
}
function formatDownloadLink(lang, name, version) {
const fullName = `${lang}_${name}-${version}`
const filename = `${fullName}-py3-none-any.whl`
const url = `https://github.com/explosion/spacy-models/releases/download/${fullName}/${filename}`
return <Link to={url} hideIcon>{filename}</Link>
}
function formatModelMeta(data) {
return {
fullName: `${data.lang}_${data.name}-${data.version}`,
@@ -154,6 +162,7 @@ function formatModelMeta(data) {
labels: isEmptyObj(data.labels) ? null : data.labels,
vectors: formatVectors(data.vectors),
accuracy: formatAccuracy(data.performance, data.lang),
download_link: formatDownloadLink(data.lang, data.name, data.version),
}
}
@@ -244,6 +253,7 @@ const Model = ({
{ label: 'Components', content: components, help: MODEL_META.components },
{ label: 'Pipeline', content: pipeline, help: MODEL_META.pipeline },
{ label: 'Vectors', content: meta.vectors, help: MODEL_META.vecs },
{ label: 'Download Link', content: meta.download_link, help: MODEL_META.download_link },
{ label: 'Sources', content: sources, help: MODEL_META.sources },
{ label: 'Author', content: author },
{ label: 'License', content: license },