mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Merge pull request #11448 from shadeMe/merge-develop-into-v4
Merge `develop` into `v4`
This commit is contained in:
commit
60c050e82b
|
@ -6,7 +6,6 @@ requires = [
|
|||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"pathy",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Our libraries
|
||||
spacy-legacy>=3.0.9,<3.1.0
|
||||
spacy-legacy>=3.0.10,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
|
@ -34,4 +34,5 @@ mypy>=0.910,<0.970; platform_machine!='aarch64'
|
|||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
|
|
|
@ -33,7 +33,7 @@ include_package_data = true
|
|||
python_requires = >=3.6
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.9,<3.1.0
|
||||
spacy-legacy>=3.0.10,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
|
@ -42,9 +42,9 @@ install_requires =
|
|||
wasabi>=0.9.1,<1.1.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
# Third-party dependencies
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
|
|
|
@ -31,21 +31,21 @@ def load(
|
|||
name: Union[str, Path],
|
||||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = util.SimpleFrozenList(),
|
||||
enable: Iterable[str] = util.SimpleFrozenList(),
|
||||
exclude: Iterable[str] = util.SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
|
||||
) -> Language:
|
||||
"""Load a spaCy model from an installed package or a local path.
|
||||
|
||||
name (str): Package name or model path.
|
||||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
|
|
|
@ -20,7 +20,7 @@ def download_cli(
|
|||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Name of pipeline package to download"),
|
||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -36,7 +36,12 @@ def download_cli(
|
|||
download(model, direct, sdist, *ctx.args)
|
||||
|
||||
|
||||
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
|
||||
def download(
|
||||
model: str,
|
||||
direct: bool = False,
|
||||
sdist: bool = False,
|
||||
*pip_args,
|
||||
) -> None:
|
||||
if (
|
||||
not (is_package("spacy") or is_package("spacy-nightly"))
|
||||
and "--no-deps" not in pip_args
|
||||
|
@ -50,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
|
|||
"dependencies, you'll have to install them manually."
|
||||
)
|
||||
pip_args = pip_args + ("--no-deps",)
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
|
||||
if direct:
|
||||
components = model.split("-")
|
||||
model_name = "".join(components[:-1])
|
||||
version = components[-1]
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
else:
|
||||
model_name = model
|
||||
if model in OLD_MODEL_SHORTCUTS:
|
||||
|
@ -67,13 +69,26 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
|
|||
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
|
||||
filename = get_model_filename(model_name, version, sdist)
|
||||
|
||||
download_model(filename, pip_args)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
f"You can now load the package via spacy.load('{model_name}')",
|
||||
)
|
||||
|
||||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    """Build the release-relative file path used to download a pipeline.

    model_name (str): Name of the pipeline package.
    version (str): Version of the pipeline package.
    sdist (bool): If True, use the sdist suffix instead of the wheel suffix.
    RETURNS (str): "<name>-<version>/<name>-<version><suffix>", with an
        "#egg=<name>==<version>" fragment appended for sdist archives only.
    """
    # Pick the archive suffix and the optional egg fragment in one place;
    # wheels get no fragment at all.
    if sdist:
        suffix, egg_tpl = SDIST_SUFFIX, "#egg={m}=={v}"
    else:
        suffix, egg_tpl = WHEEL_SUFFIX, ""
    path_tpl = "{m}-{v}/{m}-{v}{s}"
    return (path_tpl + egg_tpl).format(m=model_name, v=version, s=suffix)
|
||||
|
||||
|
||||
def get_compatibility() -> dict:
|
||||
if is_prerelease_version(about.__version__):
|
||||
version: Optional[str] = about.__version__
|
||||
|
@ -105,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
|
|||
return comp[model][0]
|
||||
|
||||
|
||||
def get_latest_version(model: str) -> str:
    """Resolve a pipeline's version from freshly fetched compatibility data.

    model (str): Name of the pipeline package.
    RETURNS (str): The version that get_version selects for the pipeline
        from the table returned by get_compatibility.
    """
    return get_version(model, get_compatibility())
|
||||
|
||||
|
||||
def download_model(
|
||||
filename: str, user_pip_args: Optional[Sequence[str]] = None
|
||||
) -> None:
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
import srsly
|
||||
|
||||
from ._util import app, Arg, Opt, string_to_list
|
||||
from .download import get_model_filename, get_latest_version
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
@ -16,6 +19,7 @@ def info_cli(
|
|||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -23,10 +27,19 @@ def info_cli(
|
|||
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
|
||||
Flag --url prints only the download URL of the most recent compatible
|
||||
version of the pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#info
|
||||
"""
|
||||
exclude = string_to_list(exclude)
|
||||
info(model, markdown=markdown, silent=silent, exclude=exclude)
|
||||
info(
|
||||
model,
|
||||
markdown=markdown,
|
||||
silent=silent,
|
||||
exclude=exclude,
|
||||
url=url,
|
||||
)
|
||||
|
||||
|
||||
def info(
|
||||
|
@ -35,11 +48,20 @@ def info(
|
|||
markdown: bool = False,
|
||||
silent: bool = True,
|
||||
exclude: Optional[List[str]] = None,
|
||||
url: bool = False,
|
||||
) -> Union[str, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if not exclude:
|
||||
exclude = []
|
||||
if model:
|
||||
if url:
|
||||
if model is not None:
|
||||
title = f"Download info for pipeline '{model}'"
|
||||
data = info_model_url(model)
|
||||
print(data["download_url"])
|
||||
return data
|
||||
else:
|
||||
msg.fail("--url option requires a pipeline name", exits=1)
|
||||
elif model:
|
||||
title = f"Info about pipeline '{model}'"
|
||||
data = info_model(model, silent=silent)
|
||||
else:
|
||||
|
@ -99,11 +121,43 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
|||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = str(model_path)
|
||||
download_url = info_installed_model_url(model)
|
||||
if download_url:
|
||||
meta["download_url"] = download_url
|
||||
return {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
|
||||
}
|
||||
|
||||
|
||||
def info_installed_model_url(model: str) -> Optional[str]:
    """Return the URL recorded in an installed pipeline's direct_url.json.

    The lookup only succeeds for pipelines installed as packages whose
    distribution metadata includes a direct_url.json file.

    model (str): Name of the installed pipeline package.
    RETURNS (Optional[str]): The "url" entry of direct_url.json, or None if
        it cannot be read for any reason.
    """
    try:
        distribution = pkg_resources.get_distribution(model)
        direct_url = json.loads(distribution.get_metadata("direct_url.json"))
        return direct_url["url"]
    except Exception:
        # Best-effort lookup: no such package (DistributionNotFound), a
        # missing metadata file or invalid JSON all mean "no URL available".
        return None
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
    """Build the download and release-page URLs for a pipeline.

    The version is the one returned by get_latest_version for the pipeline.

    model (str): Name of the pipeline package.
    RETURNS (Dict[str, Any]): Dict with the keys "download_url" and
        "release_url".
    """
    version = get_latest_version(model)
    release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
    return {
        # Download location: base download URL plus the wheel file path.
        "download_url": "/".join(
            (about.__download_url__, get_model_filename(model, version))
        ),
        "release_url": release_tpl.format(m=model, v=version),
    }
|
||||
|
||||
|
||||
def get_markdown(
|
||||
data: Dict[str, Any],
|
||||
title: Optional[str] = None,
|
||||
|
|
|
@ -230,8 +230,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"initialized component.")
|
||||
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
|
||||
"exists. Existing factory: {func}. New factory: {new_func}")
|
||||
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
||||
"custom component, maybe you forgot to return the processed Doc?")
|
||||
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
|
||||
"Doc. If you're using a custom component, maybe you forgot to "
|
||||
"return the processed Doc?")
|
||||
E006 = ("Invalid constraints for adding pipeline component. You can only "
|
||||
"set one of the following: before (component name or index), "
|
||||
"after (component name or index), first (True) or last (True). "
|
||||
|
|
|
@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
|
|||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(self.lookup_lemmatize(token)[0])
|
||||
|
||||
# use lookups, and fall back to the token itself
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.append(lookup_table.get(string, [string])[0])
|
||||
forms = list(dict.fromkeys(forms))
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
|
|
@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
rules = rules_table.get(univ_pos, [])
|
||||
string = string.lower()
|
||||
forms = []
|
||||
# first try lookup in table based on upos
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
||||
# then add anything in the exceptions table
|
||||
forms.extend(exceptions.get(string, []))
|
||||
|
||||
# if nothing found yet, use the rules
|
||||
oov_forms = []
|
||||
if not forms:
|
||||
for old, new in rules:
|
||||
|
@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
|
||||
# if still nothing, add the oov forms from rules
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(self.lookup_lemmatize(token)[0])
|
||||
|
||||
# use lookups, which fall back to the token itself
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.append(lookup_table.get(string, [string])[0])
|
||||
forms = list(dict.fromkeys(forms))
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
|
18
spacy/lang/la/__init__.py
Normal file
18
spacy/lang/la/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ...language import Language, BaseDefaults
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
|
||||
class LatinDefaults(BaseDefaults):
    """Language defaults for Latin, wired to the data modules of this package."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class Latin(Language):
    """Language subclass for Latin (language code "la")."""

    Defaults = LatinDefaults
    lang = "la"
|
||||
|
||||
|
||||
__all__ = ["Latin"]
|
34
spacy/lang/la/lex_attrs.py
Normal file
34
spacy/lang/la/lex_attrs.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
import re
|
||||
|
||||
# cf. Goyvaerts/Levithan 2009; case-insensitive, allows up to 4 repeats of a
# symbol (e.g. "IIII"). The lookahead rejects the empty string. The pattern is
# anchored with \Z rather than $ because $ also matches just before a trailing
# newline, which would accept input such as "VI\n".
roman_numerals_compile = re.compile(
    r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})\Z"
)

_num_words = set(
    """
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
""".split()
)

_ordinal_words = set(
    """
primus prima primum secundus secunda secundum tertius tertia tertium
""".split()
)


def like_num(text):
    """Check whether a string looks like a number in Latin text.

    text (str): The string to check.
    RETURNS (bool): True if the text consists only of digits, is a Roman
        numeral (any casing), or is one of the listed cardinal or ordinal
        words (any casing); False otherwise.
    """
    if text.isdigit():
        return True
    if roman_numerals_compile.match(text):
        return True
    # Lowercase once and reuse it for both word lists.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
37
spacy/lang/la/stop_words.py
Normal file
37
spacy/lang/la/stop_words.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
|
||||
|
||||
cum cur
|
||||
|
||||
de deinde dum
|
||||
|
||||
ego enim ergo es est et etiam etsi ex
|
||||
|
||||
fio
|
||||
|
||||
haud hic
|
||||
|
||||
iam idem igitur ille in infra inter interim ipse is ita
|
||||
|
||||
magis modo mox
|
||||
|
||||
nam ne nec necque neque nisi non nos
|
||||
|
||||
o ob
|
||||
|
||||
per possum post pro
|
||||
|
||||
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
|
||||
|
||||
sed si sic sive sub sui sum super suus
|
||||
|
||||
tam tamen trans tu tum
|
||||
|
||||
ubi uel uero
|
||||
|
||||
vel vero
|
||||
""".split()
|
||||
)
|
76
spacy/lang/la/tokenizer_exceptions.py
Normal file
76
spacy/lang/la/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
## TODO: Look into systematically handling u/v
|
||||
_exc = {
|
||||
"mecum": [{ORTH: "me"}, {ORTH: "cum"}],
|
||||
"tecum": [{ORTH: "te"}, {ORTH: "cum"}],
|
||||
"nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
|
||||
"vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
|
||||
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
||||
}
|
||||
|
||||
for orth in [
|
||||
"A.",
|
||||
"Agr.",
|
||||
"Ap.",
|
||||
"C.",
|
||||
"Cn.",
|
||||
"D.",
|
||||
"F.",
|
||||
"K.",
|
||||
"L.",
|
||||
"M'.",
|
||||
"M.",
|
||||
"Mam.",
|
||||
"N.",
|
||||
"Oct.",
|
||||
"Opet.",
|
||||
"P.",
|
||||
"Paul.",
|
||||
"Post.",
|
||||
"Pro.",
|
||||
"Q.",
|
||||
"S.",
|
||||
"Ser.",
|
||||
"Sert.",
|
||||
"Sex.",
|
||||
"St.",
|
||||
"Sta.",
|
||||
"T.",
|
||||
"Ti.",
|
||||
"V.",
|
||||
"Vol.",
|
||||
"Vop.",
|
||||
"U.",
|
||||
"Uol.",
|
||||
"Uop.",
|
||||
"Ian.",
|
||||
"Febr.",
|
||||
"Mart.",
|
||||
"Apr.",
|
||||
"Mai.",
|
||||
"Iun.",
|
||||
"Iul.",
|
||||
"Aug.",
|
||||
"Sept.",
|
||||
"Oct.",
|
||||
"Nov.",
|
||||
"Nou.",
|
||||
"Dec.",
|
||||
"Non.",
|
||||
"Id.",
|
||||
"A.D.",
|
||||
"Coll.",
|
||||
"Cos.",
|
||||
"Ord.",
|
||||
"Pl.",
|
||||
"S.C.",
|
||||
"Suff.",
|
||||
"Trib.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -1028,8 +1028,8 @@ class Language:
|
|||
raise ValueError(Errors.E109.format(name=name)) from e
|
||||
except Exception as e:
|
||||
error_handler(name, proc, [doc], e)
|
||||
if doc is None:
|
||||
raise ValueError(Errors.E005.format(name=name))
|
||||
if not isinstance(doc, Doc):
|
||||
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
|
||||
return doc
|
||||
|
||||
def disable_pipes(self, *names) -> "DisabledPipes":
|
||||
|
@ -1063,7 +1063,7 @@ class Language:
|
|||
"""
|
||||
if enable is None and disable is None:
|
||||
raise ValueError(Errors.E991)
|
||||
if disable is not None and isinstance(disable, str):
|
||||
if isinstance(disable, str):
|
||||
disable = [disable]
|
||||
if enable is not None:
|
||||
if isinstance(enable, str):
|
||||
|
@ -1698,9 +1698,9 @@ class Language:
|
|||
config: Union[Dict[str, Any], Config] = {},
|
||||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||||
auto_fill: bool = True,
|
||||
validate: bool = True,
|
||||
|
@ -1711,12 +1711,12 @@ class Language:
|
|||
|
||||
config (Dict[str, Any] / Config): The loaded config.
|
||||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
|
||||
Disabled pipes will be loaded but they won't be run unless you
|
||||
explicitly enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude.
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
|
||||
Excluded components won't be loaded.
|
||||
meta (Dict[str, Any]): Meta overrides for nlp.meta.
|
||||
auto_fill (bool): Automatically fill in missing values in config based
|
||||
|
@ -1727,6 +1727,12 @@ class Language:
|
|||
|
||||
DOCS: https://spacy.io/api/language#from_config
|
||||
"""
|
||||
if isinstance(disable, str):
|
||||
disable = [disable]
|
||||
if isinstance(enable, str):
|
||||
enable = [enable]
|
||||
if isinstance(exclude, str):
|
||||
exclude = [exclude]
|
||||
if auto_fill:
|
||||
config = Config(
|
||||
cls.default_config, section_order=CONFIG_SECTION_ORDER
|
||||
|
@ -2031,25 +2037,29 @@ class Language:
|
|||
|
||||
@staticmethod
|
||||
def _resolve_component_status(
|
||||
disable: Iterable[str], enable: Iterable[str], pipe_names: Collection[str]
|
||||
disable: Union[str, Iterable[str]],
|
||||
enable: Union[str, Iterable[str]],
|
||||
pipe_names: Iterable[str],
|
||||
) -> Tuple[str, ...]:
|
||||
"""Derives whether (1) `disable` and `enable` values are consistent and (2)
|
||||
resolves those to a single set of disabled components. Raises an error in
|
||||
case of inconsistency.
|
||||
|
||||
disable (Iterable[str]): Names of components or serialization fields to disable.
|
||||
enable (Iterable[str]): Names of pipeline components to enable.
|
||||
disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable.
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable.
|
||||
pipe_names (Iterable[str]): Names of all pipeline components.
|
||||
|
||||
RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t.
|
||||
specified includes and excludes.
|
||||
"""
|
||||
|
||||
if disable is not None and isinstance(disable, str):
|
||||
if isinstance(disable, str):
|
||||
disable = [disable]
|
||||
to_disable = disable
|
||||
|
||||
if enable:
|
||||
if isinstance(enable, str):
|
||||
enable = [enable]
|
||||
to_disable = [
|
||||
pipe_name for pipe_name in pipe_names if pipe_name not in enable
|
||||
]
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, cython: profile=True
|
||||
from typing import List
|
||||
from typing import List, Iterable
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t, int8_t
|
||||
|
@ -868,20 +868,27 @@ class _SetPredicate:
|
|||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
value = get_string_id(token._.get(self.attr))
|
||||
value = token._.get(self.attr)
|
||||
else:
|
||||
value = get_token_attr_for_matcher(token.c, self.attr)
|
||||
|
||||
if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
|
||||
if self.predicate in ("IN", "NOT_IN"):
|
||||
if isinstance(value, (str, int)):
|
||||
value = get_string_id(value)
|
||||
else:
|
||||
return False
|
||||
elif self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
|
||||
# ensure that all values are enclosed in a set
|
||||
if self.attr == MORPH:
|
||||
# break up MORPH into individual Feat=Val values
|
||||
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
|
||||
elif isinstance(value, (str, int)):
|
||||
value = set((get_string_id(value),))
|
||||
elif isinstance(value, Iterable) and all(isinstance(v, (str, int)) for v in value):
|
||||
value = set(get_string_id(v) for v in value)
|
||||
else:
|
||||
# treat a single value as a list
|
||||
if isinstance(value, (str, int)):
|
||||
value = set([get_string_id(value)])
|
||||
else:
|
||||
value = set(get_string_id(v) for v in value)
|
||||
return False
|
||||
|
||||
if self.predicate == "IN":
|
||||
return value in self.value
|
||||
elif self.predicate == "NOT_IN":
|
||||
|
|
|
@ -256,6 +256,11 @@ def ko_tokenizer_tokenizer():
|
|||
return nlp.tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def la_tokenizer():
|
||||
return get_lang_class("la")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ko_tokenizer_natto():
|
||||
pytest.importorskip("natto")
|
||||
|
|
0
spacy/tests/lang/la/__init__.py
Normal file
0
spacy/tests/lang/la/__init__.py
Normal file
8
spacy/tests/lang/la/test_exception.py
Normal file
8
spacy/tests/lang/la/test_exception.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
|
||||
|
||||
def test_la_tokenizer_handles_exc_in_text(la_tokenizer):
|
||||
text = "scio te omnia facturum, ut nobiscum quam primum sis"
|
||||
tokens = la_tokenizer(text)
|
||||
assert len(tokens) == 11
|
||||
assert tokens[6].text == "nobis"
|
35
spacy/tests/lang/la/test_text.py
Normal file
35
spacy/tests/lang/la/test_text.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
import pytest
|
||||
from spacy.lang.la.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("IIII", True),
|
||||
("VI", True),
|
||||
("vi", True),
|
||||
("IV", True),
|
||||
("iv", True),
|
||||
("IX", True),
|
||||
("ix", True),
|
||||
("MMXXII", True),
|
||||
("0", True),
|
||||
("1", True),
|
||||
("quattuor", True),
|
||||
("decem", True),
|
||||
("tertius", True),
|
||||
("canis", False),
|
||||
("MMXX11", False),
|
||||
(",", False),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(la_tokenizer, text, match):
|
||||
tokens = la_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["quinque"])
|
||||
def test_la_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
|
@ -368,6 +368,16 @@ def test_matcher_intersect_value_operator(en_vocab):
|
|||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# INTERSECTS matches nothing for iterables that aren't all str or int
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"INTERSECTS": ["Abx", "C"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
doc[0]._.ext = [["Abx"], "B"]
|
||||
assert len(matcher(doc)) == 0
|
||||
doc[0]._.ext = ["Abx", "B"]
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
# INTERSECTS with an empty pattern list matches nothing
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
|
||||
|
@ -476,14 +486,22 @@ def test_matcher_extension_set_membership(en_vocab):
|
|||
assert len(matches) == 0
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="IN predicate must handle sequence values in extensions")
|
||||
def test_matcher_extension_in_set_predicate(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
Token.set_extension("ext", default=[])
|
||||
pattern = [{"_": {"ext": {"IN": ["A", "C"]}}}]
|
||||
matcher.add("M", [pattern])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||
|
||||
# The IN predicate expects an exact match between the
|
||||
# extension value and one of the pattern's values.
|
||||
doc[0]._.ext = ["A", "B"]
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
doc[0]._.ext = ["A"]
|
||||
assert len(matcher(doc)) == 0
|
||||
|
||||
doc[0]._.ext = "A"
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ def test_build_dependencies():
|
|||
"types-dataclasses",
|
||||
"types-mock",
|
||||
"types-requests",
|
||||
"types-setuptools",
|
||||
]
|
||||
# ignore language-specific packages that shouldn't be installed by all
|
||||
libs_ignore_setup = [
|
||||
|
|
|
@ -618,6 +618,7 @@ def test_load_disable_enable() -> None:
|
|||
base_nlp.to_disk(tmp_dir)
|
||||
to_disable = ["parser", "tagger"]
|
||||
to_enable = ["tagger", "parser"]
|
||||
single_str = "tagger"
|
||||
|
||||
# Setting only `disable`.
|
||||
nlp = spacy.load(tmp_dir, disable=to_disable)
|
||||
|
@ -632,6 +633,16 @@ def test_load_disable_enable() -> None:
|
|||
]
|
||||
)
|
||||
|
||||
# Loading with a string representing one component
|
||||
nlp = spacy.load(tmp_dir, exclude=single_str)
|
||||
assert single_str not in nlp.component_names
|
||||
|
||||
nlp = spacy.load(tmp_dir, disable=single_str)
|
||||
assert single_str in nlp.component_names
|
||||
assert single_str not in nlp.pipe_names
|
||||
assert nlp._disabled == {single_str}
|
||||
assert nlp.disabled == [single_str]
|
||||
|
||||
# Testing consistent enable/disable combination.
|
||||
nlp = spacy.load(
|
||||
tmp_dir,
|
||||
|
|
|
@ -670,3 +670,25 @@ def test_dot_in_factory_names(nlp):
|
|||
|
||||
with pytest.raises(ValueError, match="not permitted"):
|
||||
Language.factory("my.evil.component.v1", func=evil_component)
|
||||
|
||||
|
||||
def test_component_return():
|
||||
"""Test that an error is raised if components return a type other than a
|
||||
doc."""
|
||||
nlp = English()
|
||||
|
||||
@Language.component("test_component_good_pipe")
|
||||
def good_pipe(doc):
|
||||
return doc
|
||||
|
||||
nlp.add_pipe("test_component_good_pipe")
|
||||
nlp("text")
|
||||
nlp.remove_pipe("test_component_good_pipe")
|
||||
|
||||
@Language.component("test_component_bad_pipe")
|
||||
def bad_pipe(doc):
|
||||
return doc.text
|
||||
|
||||
nlp.add_pipe("test_component_bad_pipe")
|
||||
with pytest.raises(ValueError, match="instead of a Doc"):
|
||||
nlp("text")
|
||||
|
|
|
@ -10,7 +10,8 @@ from spacy.ml._precomputable_affine import _backprop_precomputable_affine_paddin
|
|||
from spacy.util import dot_to_object, SimpleFrozenList, import_file
|
||||
from spacy.util import to_ternary_int
|
||||
from thinc.api import Config, Optimizer, ConfigValidationError
|
||||
from thinc.api import set_current_ops
|
||||
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
|
||||
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
|
||||
from spacy.training.batchers import minibatch_by_words
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.nl import Dutch
|
||||
|
@ -18,7 +19,6 @@ from spacy.language import DEFAULT_CONFIG_PATH
|
|||
from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema
|
||||
from pydantic import ValidationError
|
||||
|
||||
from thinc.api import get_current_ops, NumpyOps, CupyOps
|
||||
|
||||
from .util import get_random_doc, make_tempdir
|
||||
|
||||
|
@ -111,26 +111,25 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
|
|||
|
||||
def test_prefer_gpu():
|
||||
current_ops = get_current_ops()
|
||||
try:
|
||||
import cupy # noqa: F401
|
||||
|
||||
prefer_gpu()
|
||||
if has_cupy_gpu:
|
||||
assert prefer_gpu()
|
||||
assert isinstance(get_current_ops(), CupyOps)
|
||||
except ImportError:
|
||||
elif has_torch_mps_gpu:
|
||||
assert prefer_gpu()
|
||||
assert isinstance(get_current_ops(), MPSOps)
|
||||
else:
|
||||
assert not prefer_gpu()
|
||||
set_current_ops(current_ops)
|
||||
|
||||
|
||||
def test_require_gpu():
|
||||
current_ops = get_current_ops()
|
||||
try:
|
||||
import cupy # noqa: F401
|
||||
|
||||
if has_cupy_gpu:
|
||||
require_gpu()
|
||||
assert isinstance(get_current_ops(), CupyOps)
|
||||
except ImportError:
|
||||
with pytest.raises(ValueError):
|
||||
require_gpu()
|
||||
elif has_torch_mps_gpu:
|
||||
require_gpu()
|
||||
assert isinstance(get_current_ops(), MPSOps)
|
||||
set_current_ops(current_ops)
|
||||
|
||||
|
||||
|
|
30
spacy/tests/training/test_logger.py
Normal file
30
spacy/tests/training/test_logger.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
import pytest
|
||||
import spacy
|
||||
|
||||
from spacy.training import loggers
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def nlp():
|
||||
nlp = spacy.blank("en")
|
||||
nlp.add_pipe("ner")
|
||||
return nlp
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def info():
|
||||
return {
|
||||
"losses": {"ner": 100},
|
||||
"other_scores": {"ENTS_F": 0.85, "ENTS_P": 0.90, "ENTS_R": 0.80},
|
||||
"epoch": 100,
|
||||
"step": 125,
|
||||
"score": 85,
|
||||
}
|
||||
|
||||
|
||||
def test_console_logger(nlp, info):
|
||||
console_logger = loggers.console_logger(
|
||||
progress_bar=True, console_output=True, output_file=None
|
||||
)
|
||||
log_step, finalize = console_logger(nlp)
|
||||
log_step(info)
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Any, Dict, Iterable
|
||||
from typing import Any, Dict, Iterable, Optional
|
||||
from .doc import Doc
|
||||
from .span import Span
|
||||
|
||||
|
@ -24,4 +24,4 @@ class SpanGroup:
|
|||
def __getitem__(self, i: int) -> Span: ...
|
||||
def to_bytes(self) -> bytes: ...
|
||||
def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
|
||||
def copy(self) -> SpanGroup: ...
|
||||
def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...
|
||||
|
|
|
@ -244,15 +244,18 @@ cdef class SpanGroup:
|
|||
cdef void push_back(self, const shared_ptr[SpanC] &span):
|
||||
self.c.push_back(span)
|
||||
|
||||
def copy(self) -> SpanGroup:
|
||||
def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
|
||||
"""Clones the span group.
|
||||
|
||||
doc (Doc): New reference document to which the copy is bound.
|
||||
RETURNS (SpanGroup): A copy of the span group.
|
||||
|
||||
DOCS: https://spacy.io/api/spangroup#copy
|
||||
"""
|
||||
if doc is None:
|
||||
doc = self.doc
|
||||
return SpanGroup(
|
||||
self.doc,
|
||||
doc,
|
||||
name=self.name,
|
||||
attrs=deepcopy(self.attrs),
|
||||
spans=list(self),
|
||||
|
|
|
@ -42,7 +42,8 @@ class SpanGroups(UserDict):
|
|||
def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
|
||||
if doc is None:
|
||||
doc = self._ensure_doc()
|
||||
return SpanGroups(doc).from_bytes(self.to_bytes())
|
||||
data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
|
||||
return SpanGroups(doc, items=data_copy)
|
||||
|
||||
def setdefault(self, key, default=None):
|
||||
if not isinstance(default, SpanGroup):
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
|
||||
from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO, Union
|
||||
from wasabi import Printer
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
import sys
|
||||
import srsly
|
||||
|
||||
from ..util import registry
|
||||
from ..errors import Errors
|
||||
from .. import util
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..language import Language # noqa: F401
|
||||
|
@ -23,13 +26,44 @@ def setup_table(
|
|||
return final_cols, final_widths, ["r" for _ in final_widths]
|
||||
|
||||
|
||||
@registry.loggers("spacy.ConsoleLogger.v1")
|
||||
def console_logger(progress_bar: bool = False):
|
||||
@registry.loggers("spacy.ConsoleLogger.v2")
|
||||
def console_logger(
|
||||
progress_bar: bool = False,
|
||||
console_output: bool = True,
|
||||
output_file: Optional[Union[str, Path]] = None,
|
||||
):
|
||||
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
||||
progress_bar (bool): Whether the logger should print the progress bar.
|
||||
console_output (bool): Whether the logger should print the logs on the console.
|
||||
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||
"""
|
||||
_log_exist = False
|
||||
if output_file:
|
||||
output_file = util.ensure_path(output_file) # type: ignore
|
||||
if output_file.exists(): # type: ignore
|
||||
_log_exist = True
|
||||
if not output_file.parents[0].exists(): # type: ignore
|
||||
output_file.parents[0].mkdir(parents=True) # type: ignore
|
||||
|
||||
def setup_printer(
|
||||
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
|
||||
) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
|
||||
write = lambda text: print(text, file=stdout, flush=True)
|
||||
msg = Printer(no_print=True)
|
||||
|
||||
nonlocal output_file
|
||||
output_stream = None
|
||||
if _log_exist:
|
||||
write(
|
||||
msg.warn(
|
||||
f"Saving logs is disabled because {output_file} already exists."
|
||||
)
|
||||
)
|
||||
output_file = None
|
||||
elif output_file:
|
||||
write(msg.info(f"Saving results to {output_file}"))
|
||||
output_stream = open(output_file, "w", encoding="utf-8")
|
||||
|
||||
# ensure that only trainable components are logged
|
||||
logged_pipes = [
|
||||
name
|
||||
|
@ -40,13 +74,15 @@ def console_logger(progress_bar: bool = False):
|
|||
score_weights = nlp.config["training"]["score_weights"]
|
||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
|
||||
spacing = 2
|
||||
table_header, table_widths, table_aligns = setup_table(
|
||||
cols=["E", "#"] + loss_cols + score_cols + ["Score"],
|
||||
widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
|
||||
)
|
||||
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
||||
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
||||
|
||||
if console_output:
|
||||
spacing = 2
|
||||
table_header, table_widths, table_aligns = setup_table(
|
||||
cols=["E", "#"] + loss_cols + score_cols + ["Score"],
|
||||
widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
|
||||
)
|
||||
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
||||
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
||||
progress = None
|
||||
|
||||
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
||||
|
@ -57,12 +93,15 @@ def console_logger(progress_bar: bool = False):
|
|||
if progress is not None:
|
||||
progress.update(1)
|
||||
return
|
||||
losses = [
|
||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||
for pipe_name in logged_pipes
|
||||
]
|
||||
|
||||
losses = []
|
||||
log_losses = {}
|
||||
for pipe_name in logged_pipes:
|
||||
losses.append("{0:.2f}".format(float(info["losses"][pipe_name])))
|
||||
log_losses[pipe_name] = float(info["losses"][pipe_name])
|
||||
|
||||
scores = []
|
||||
log_scores = {}
|
||||
for col in score_cols:
|
||||
score = info["other_scores"].get(col, 0.0)
|
||||
try:
|
||||
|
@ -73,6 +112,7 @@ def console_logger(progress_bar: bool = False):
|
|||
if col != "speed":
|
||||
score *= 100
|
||||
scores.append("{0:.2f}".format(score))
|
||||
log_scores[str(col)] = score
|
||||
|
||||
data = (
|
||||
[info["epoch"], info["step"]]
|
||||
|
@ -80,20 +120,36 @@ def console_logger(progress_bar: bool = False):
|
|||
+ scores
|
||||
+ ["{0:.2f}".format(float(info["score"]))]
|
||||
)
|
||||
|
||||
if output_stream:
|
||||
# Write to log file per log_step
|
||||
log_data = {
|
||||
"epoch": info["epoch"],
|
||||
"step": info["step"],
|
||||
"losses": log_losses,
|
||||
"scores": log_scores,
|
||||
"score": float(info["score"]),
|
||||
}
|
||||
output_stream.write(srsly.json_dumps(log_data) + "\n")
|
||||
|
||||
if progress is not None:
|
||||
progress.close()
|
||||
write(
|
||||
msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
|
||||
)
|
||||
if progress_bar:
|
||||
# Set disable=None, so that it disables on non-TTY
|
||||
progress = tqdm.tqdm(
|
||||
total=eval_frequency, disable=None, leave=False, file=stderr
|
||||
if console_output:
|
||||
write(
|
||||
msg.row(
|
||||
data, widths=table_widths, aligns=table_aligns, spacing=spacing
|
||||
)
|
||||
)
|
||||
progress.set_description(f"Epoch {info['epoch']+1}")
|
||||
if progress_bar:
|
||||
# Set disable=None, so that it disables on non-TTY
|
||||
progress = tqdm.tqdm(
|
||||
total=eval_frequency, disable=None, leave=False, file=stderr
|
||||
)
|
||||
progress.set_description(f"Epoch {info['epoch']+1}")
|
||||
|
||||
def finalize() -> None:
|
||||
pass
|
||||
if output_stream:
|
||||
output_stream.close()
|
||||
|
||||
return log_step, finalize
|
||||
|
||||
|
|
|
@ -398,9 +398,9 @@ def load_model(
|
|||
name: Union[str, Path],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from a package or data path.
|
||||
|
@ -408,9 +408,9 @@ def load_model(
|
|||
name (str): Package name or model path.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All others will be disabled.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude.
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All others will be disabled.
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
|
@ -440,9 +440,9 @@ def load_model_from_package(
|
|||
name: str,
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from an installed package.
|
||||
|
@ -450,12 +450,12 @@ def load_model_from_package(
|
|||
name (str): The package name.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
|
@ -470,9 +470,9 @@ def load_model_from_path(
|
|||
*,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from a data directory path. Creates Language class with
|
||||
|
@ -482,12 +482,12 @@ def load_model_from_path(
|
|||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
|
@ -516,9 +516,9 @@ def load_model_from_config(
|
|||
*,
|
||||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
auto_fill: bool = False,
|
||||
validate: bool = True,
|
||||
) -> "Language":
|
||||
|
@ -529,12 +529,12 @@ def load_model_from_config(
|
|||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
auto_fill (bool): Whether to auto-fill config with missing defaults.
|
||||
validate (bool): Whether to show config validation errors.
|
||||
|
@ -616,9 +616,9 @@ def load_model_from_init_py(
|
|||
init_file: Union[Path, str],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
enable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
enable: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Helper function to use in the `load()` method of a model package's
|
||||
|
@ -626,12 +626,12 @@ def load_model_from_init_py(
|
|||
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Iterable[str]): Names of pipeline components to enable. All other
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
|
|
|
@ -77,14 +77,15 @@ $ python -m spacy info [--markdown] [--silent] [--exclude]
|
|||
$ python -m spacy info [model] [--markdown] [--silent] [--exclude]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Information about your spaCy installation. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
|
||||
| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
|
||||
| `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
|
||||
| `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
|
||||
| `--url`, `-u` <Tag variant="new">3.5.0</Tag> | Print the URL to download the most recent compatible version of the pipeline. Requires a pipeline name. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Information about your spaCy installation. |
|
||||
|
||||
## validate {#validate new="2" tag="command"}
|
||||
|
||||
|
|
|
@ -63,17 +63,18 @@ spaCy loads a model under the hood based on its
|
|||
> nlp = Language.from_config(config)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
|
||||
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
|
||||
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The initialized object. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
|
||||
| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
|
||||
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
|
||||
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The initialized object. ~~Language~~ |
|
||||
|
||||
## Language.component {#component tag="classmethod" new="3"}
|
||||
|
||||
|
@ -695,8 +696,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
|
|||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `disable` | Name(s) of pipeline component(s) to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `enable` | Name(s) of pipeline component(s) that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
|
||||
|
||||
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
|
||||
|
|
|
@ -248,6 +248,59 @@ added to an existing vectors table. See more details in
|
|||
|
||||
## Loggers {#loggers}
|
||||
|
||||
These functions are available from `@spacy.registry.loggers`.
|
||||
|
||||
### spacy.ConsoleLogger.v1 {#ConsoleLogger_v1}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.logger]
|
||||
> @loggers = "spacy.ConsoleLogger.v1"
|
||||
> progress_bar = true
|
||||
> ```
|
||||
|
||||
Writes the results of a training step to the console in a tabular format.
|
||||
|
||||
<Accordion title="Example console output" spaced>
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg
|
||||
```
|
||||
|
||||
```
|
||||
ℹ Using CPU
|
||||
ℹ Loading config and nlp from: config.cfg
|
||||
ℹ Pipeline: ['tok2vec', 'tagger']
|
||||
ℹ Start training
|
||||
ℹ Training. Initial learn rate: 0.0
|
||||
|
||||
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
|
||||
--- ------ ------------ ----------- ------- ------
|
||||
0 0 0.00 86.20 0.22 0.00
|
||||
0 200 3.08 18968.78 34.00 0.34
|
||||
0 400 31.81 22539.06 33.64 0.34
|
||||
0 600 92.13 22794.91 43.80 0.44
|
||||
0 800 183.62 21541.39 56.05 0.56
|
||||
0 1000 352.49 25461.82 65.15 0.65
|
||||
0 1200 422.87 23708.82 71.84 0.72
|
||||
0 1400 601.92 24994.79 76.57 0.77
|
||||
0 1600 662.57 22268.02 80.20 0.80
|
||||
0 1800 1101.50 28413.77 82.56 0.83
|
||||
0 2000 1253.43 28736.36 85.00 0.85
|
||||
0 2200 1411.02 28237.53 87.42 0.87
|
||||
0 2400 1605.35 28439.95 88.70 0.89
|
||||
```
|
||||
|
||||
Note that the cumulative loss keeps increasing within one epoch, but should
|
||||
start decreasing across epochs.
|
||||
|
||||
</Accordion>
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | --------------------------------------------------------- |
|
||||
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
|
||||
|
||||
Logging utilities for spaCy are implemented in the
|
||||
[`spacy-loggers`](https://github.com/explosion/spacy-loggers) repo, and the
|
||||
functions are typically available from `@spacy.registry.loggers`.
|
||||
|
|
|
@ -255,9 +255,10 @@ Return a copy of the span group.
|
|||
> new_group = doc.spans["errors"].copy()
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------- |
|
||||
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
|
||||
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
|
||||
|
||||
## SpanGroup.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.
|
|||
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `enable` | Names of pipeline components to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
|
||||
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
|
||||
|
@ -275,8 +275,8 @@ Render a dependency parse tree or named entity visualization.
|
|||
|
||||
### displacy.parse_deps {#displacy.parse_deps tag="method" new="2"}
|
||||
|
||||
Generate dependency parse in `{'words': [], 'arcs': []}` format.
|
||||
For use with the `manual=True` argument in `displacy.render`.
|
||||
Generate dependency parse in `{'words': [], 'arcs': []}` format. For use with
|
||||
the `manual=True` argument in `displacy.render`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -297,8 +297,8 @@ For use with the `manual=True` argument in `displacy.render`.
|
|||
|
||||
### displacy.parse_ents {#displacy.parse_ents tag="method" new="2"}
|
||||
|
||||
Generate named entities in `[{start: i, end: i, label: 'label'}]` format.
|
||||
For use with the `manual=True` argument in `displacy.render`.
|
||||
Generate named entities in `[{start: i, end: i, label: 'label'}]` format. For
|
||||
use with the `manual=True` argument in `displacy.render`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -319,8 +319,8 @@ For use with the `manual=True` argument in `displacy.render`.
|
|||
|
||||
### displacy.parse_spans {#displacy.parse_spans tag="method" new="2"}
|
||||
|
||||
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format.
|
||||
For use with the `manual=True` argument in `displacy.render`.
|
||||
Generate spans in `[{start_token: i, end_token: i, label: 'label'}]` format. For
|
||||
use with the `manual=True` argument in `displacy.render`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -451,7 +451,7 @@ factories.
|
|||
| Registry name | Description |
|
||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
|
||||
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
|
||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||
|
@ -505,7 +505,7 @@ finished. To log each training step, a
|
|||
and the accuracy scores on the development set.
|
||||
|
||||
The built-in, default logger is the ConsoleLogger, which prints results to the
|
||||
console in tabular format. The
|
||||
console in tabular format and can optionally save them to a `jsonl` file. The
|
||||
[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
|
||||
a dependency of spaCy, enables other loggers, such as one that sends results to
|
||||
a [Weights & Biases](https://www.wandb.com/) dashboard.
|
||||
|
@ -513,16 +513,20 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
|
|||
Instead of using one of the built-in loggers, you can
|
||||
[implement your own](/usage/training#custom-logging).
|
||||
|
||||
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
|
||||
#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.logger]
|
||||
> @loggers = "spacy.ConsoleLogger.v1"
|
||||
> @loggers = "spacy.ConsoleLogger.v2"
|
||||
> progress_bar = true
|
||||
> console_output = true
|
||||
> output_file = "training_log.jsonl"
|
||||
> ```
|
||||
|
||||
Writes the results of a training step to the console in a tabular format.
|
||||
Writes the results of a training step to the console in a tabular format and
|
||||
optionally saves them to a `jsonl` file.
|
||||
|
||||
<Accordion title="Example console output" spaced>
|
||||
|
||||
|
@ -536,22 +540,23 @@ $ python -m spacy train config.cfg
|
|||
ℹ Pipeline: ['tok2vec', 'tagger']
|
||||
ℹ Start training
|
||||
ℹ Training. Initial learn rate: 0.0
|
||||
ℹ Saving results to training_log.jsonl
|
||||
|
||||
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
|
||||
--- ------ ------------ ----------- ------- ------
|
||||
1 0 0.00 86.20 0.22 0.00
|
||||
1 200 3.08 18968.78 34.00 0.34
|
||||
1 400 31.81 22539.06 33.64 0.34
|
||||
1 600 92.13 22794.91 43.80 0.44
|
||||
1 800 183.62 21541.39 56.05 0.56
|
||||
1 1000 352.49 25461.82 65.15 0.65
|
||||
1 1200 422.87 23708.82 71.84 0.72
|
||||
1 1400 601.92 24994.79 76.57 0.77
|
||||
1 1600 662.57 22268.02 80.20 0.80
|
||||
1 1800 1101.50 28413.77 82.56 0.83
|
||||
1 2000 1253.43 28736.36 85.00 0.85
|
||||
1 2200 1411.02 28237.53 87.42 0.87
|
||||
1 2400 1605.35 28439.95 88.70 0.89
|
||||
0 0 0.00 86.20 0.22 0.00
|
||||
0 200 3.08 18968.78 34.00 0.34
|
||||
0 400 31.81 22539.06 33.64 0.34
|
||||
0 600 92.13 22794.91 43.80 0.44
|
||||
0 800 183.62 21541.39 56.05 0.56
|
||||
0 1000 352.49 25461.82 65.15 0.65
|
||||
0 1200 422.87 23708.82 71.84 0.72
|
||||
0 1400 601.92 24994.79 76.57 0.77
|
||||
0 1600 662.57 22268.02 80.20 0.80
|
||||
0 1800 1101.50 28413.77 82.56 0.83
|
||||
0 2000 1253.43 28736.36 85.00 0.85
|
||||
0 2200 1411.02 28237.53 87.42 0.87
|
||||
0 2400 1605.35 28439.95 88.70 0.89
|
||||
```
|
||||
|
||||
Note that the cumulative loss keeps increasing within one epoch, but should
|
||||
|
@ -559,6 +564,12 @@ start decreasing across epochs.
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | --------------------------------------------------------------------- |
|
||||
| `progress_bar` | Whether the logger should print the progress bar. ~~bool~~ |
|
||||
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
|
||||
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
|
||||
|
||||
## Readers {#readers}
|
||||
|
||||
### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
|
||||
|
@ -1038,15 +1049,16 @@ and create a `Language` object. The model data will then be loaded in via
|
|||
> nlp = util.load_model("/path/to/data")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | Package name or path. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Package name or path. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
|
||||
|
||||
|
@ -1062,15 +1074,16 @@ A helper function to use in the `load()` method of a pipeline package's
|
|||
> return load_model_from_init_py(__file__, **overrides)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
|
||||
|
||||
### util.load_config {#util.load_config tag="function" new="3"}
|
||||
|
||||
|
|
|
@ -396,15 +396,32 @@ pipeline package can be found.
|
|||
To download a trained pipeline directly using
|
||||
[pip](https://pypi.python.org/pypi/pip), point `pip install` to the URL or local
|
||||
path of the wheel file or archive. Installing the wheel is usually more
|
||||
efficient. To find the direct link to a package, head over to the
|
||||
[releases](https://github.com/explosion/spacy-models/releases), right click on
|
||||
the archive link and copy it to your clipboard.
|
||||
efficient.
|
||||
|
||||
> #### Pipeline Package URLs {#pipeline-urls}
|
||||
>
|
||||
> Pretrained pipeline distributions are hosted on
|
||||
> [GitHub Releases](https://github.com/explosion/spacy-models/releases), and you
|
||||
> can find download links there, as well as on the model page. You can also get
|
||||
> URLs directly from the command line by using `spacy info` with the `--url`
|
||||
> flag, which may be useful for automation.
|
||||
>
|
||||
> ```bash
|
||||
> spacy info en_core_web_sm --url
|
||||
> ```
|
||||
>
|
||||
> This command will print the URL for the latest version of a pipeline
|
||||
> compatible with the version of spaCy you're using. Note that in order to look
|
||||
> up the compatibility information an internet connection is required.
|
||||
|
||||
```bash
|
||||
# With external URL
|
||||
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
|
||||
$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
|
||||
|
||||
# Using spacy info to get the external URL
|
||||
$ pip install $(spacy info en_core_web_sm --url)
|
||||
|
||||
# With local file
|
||||
$ pip install /Users/you/en_core_web_sm-3.0.0-py3-none-any.whl
|
||||
$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz
|
||||
|
@ -545,21 +562,16 @@ should be specifying them directly.
|
|||
Because pipeline packages are valid Python packages, you can add them to your
|
||||
application's `requirements.txt`. If you're running your own internal PyPI
|
||||
installation, you can upload the pipeline packages there. pip's
|
||||
[requirements file format](https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format)
|
||||
supports both package names to download via a PyPi server, as well as direct
|
||||
URLs.
|
||||
[requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
|
||||
supports both package names to download via a PyPI server, as well as
|
||||
[direct URLs](#pipeline-urls).
|
||||
|
||||
```text
|
||||
### requirements.txt
|
||||
spacy>=3.0.0,<4.0.0
|
||||
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web_sm
|
||||
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
|
||||
```
|
||||
|
||||
Specifying `#egg=` with the package name tells pip which package to expect from
|
||||
the download URL. This way, the package won't be re-downloaded and overwritten
|
||||
if it's already installed - just like when you're downloading a package from
|
||||
PyPi.
|
||||
|
||||
All pipeline packages are versioned and specify their spaCy dependency. This
|
||||
ensures cross-compatibility and lets you specify exact version requirements for
|
||||
each pipeline. If you've [trained](/usage/training) your own pipeline, you can
|
||||
|
|
|
@ -1192,7 +1192,7 @@
|
|||
"slogan": "Fast, flexible and transparent sentiment analysis",
|
||||
"description": "Asent is a rule-based sentiment analysis library for Python made using spaCy. It is inspired by VADER, but uses a more modular ruleset, that allows the user to change e.g. the method for finding negations. Furthermore it includes visualisers to visualize the model predictions, making the model easily interpretable.",
|
||||
"github": "kennethenevoldsen/asent",
|
||||
"pip": "aseny",
|
||||
"pip": "asent",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"import asent",
|
||||
|
|
|
@ -76,6 +76,7 @@ const MODEL_META = {
|
|||
benchmark_ner: 'NER accuracy',
|
||||
benchmark_speed: 'Speed',
|
||||
compat: 'Latest compatible package version for your spaCy installation',
|
||||
download_link: 'Download link for the pipeline',
|
||||
}
|
||||
|
||||
const LABEL_SCHEME_META = {
|
||||
|
@ -138,6 +139,13 @@ function formatAccuracy(data, lang) {
|
|||
.filter(item => item)
|
||||
}
|
||||
|
||||
// Build a link element pointing at the wheel file of a pipeline package in the
// explosion/spacy-models GitHub releases.
//   lang, name, version: combined as `${lang}_${name}-${version}` to form the
//   package's full name.
// Returns a <Link> whose visible text is the wheel filename.
function formatDownloadLink(lang, name, version) {
|
||||
// The full name is used both as the release path segment and as the filename prefix.
const fullName = `${lang}_${name}-${version}`
|
||||
// Filename carries the fixed `py3-none-any` wheel suffix.
const filename = `${fullName}-py3-none-any.whl`
|
||||
const url = `https://github.com/explosion/spacy-models/releases/download/${fullName}/${filename}`
|
||||
return <Link to={url} hideIcon>{filename}</Link>
|
||||
}
|
||||
|
||||
function formatModelMeta(data) {
|
||||
return {
|
||||
fullName: `${data.lang}_${data.name}-${data.version}`,
|
||||
|
@ -154,6 +162,7 @@ function formatModelMeta(data) {
|
|||
labels: isEmptyObj(data.labels) ? null : data.labels,
|
||||
vectors: formatVectors(data.vectors),
|
||||
accuracy: formatAccuracy(data.performance, data.lang),
|
||||
download_link: formatDownloadLink(data.lang, data.name, data.version),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -244,6 +253,7 @@ const Model = ({
|
|||
{ label: 'Components', content: components, help: MODEL_META.components },
|
||||
{ label: 'Pipeline', content: pipeline, help: MODEL_META.pipeline },
|
||||
{ label: 'Vectors', content: meta.vectors, help: MODEL_META.vecs },
|
||||
{ label: 'Download Link', content: meta.download_link, help: MODEL_META.download_link },
|
||||
{ label: 'Sources', content: sources, help: MODEL_META.sources },
|
||||
{ label: 'Author', content: author },
|
||||
{ label: 'License', content: license },
|
||||
|
|
Loading…
Reference in New Issue
Block a user