Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-24 16:24:16 +03:00)

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit: c379a4274a

Makefile:
@@ -1,7 +1,7 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif

 ifndef PYVER

pyproject.toml:
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a42,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

requirements.txt:
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a41,<8.0.0a50
+thinc>=8.0.0a42,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

setup.cfg:
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -65,7 +65,7 @@ console_scripts =

 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =

spacy/cli/_util.py:
@@ -16,6 +16,7 @@ import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ENV_VARS

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
-OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -65,7 +65,7 @@ def setup_cli() -> None:


 def parse_config_overrides(
-    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
 ) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
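
A quick sketch of the resulting behavior, modeled directly on the test changes
further down in this diff (`test_parse_cli_overrides`): if no extra CLI
arguments are given, the environment variable named by
`ENV_VARS.CONFIG_OVERRIDES` is parsed instead, and values are interpreted as
JSON, so numbers and booleans come through typed.

```python
import os
from spacy.cli._util import parse_config_overrides
from spacy.util import ENV_VARS

# With no CLI args, the SPACY_CONFIG_OVERRIDES env variable is used as a
# fallback; "128" is parsed to the integer 128.
os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--training.batch_size 128"
assert parse_config_overrides([]) == {"training.batch_size": 128}
```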

spacy/cli/init_pipeline.py:
@@ -27,7 +27,7 @@ def init_vectors_cli(
     you can use in the [initialize.vocab] block of your config to initialize
     a model with vectors.
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
@@ -55,14 +55,14 @@ def init_pipeline_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+        nlp = init_nlp(config, use_gpu=use_gpu)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")

@@ -81,9 +81,12 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    """Generate a JSON file for labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if not output_path.exists():
+        output_path.mkdir()
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -93,7 +96,8 @@ def init_labels_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     for name, component in nlp.pipeline:
         if getattr(component, "label_data", None) is not None:
-            srsly.write_json(output_path / f"{name}.json", component.label_data)
-            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
         else:
             msg.info(f"No labels found for {name}")
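
The files written here are plain JSON, one per component, so they can be
inspected or post-processed with srsly. A minimal sketch (the path below is
hypothetical):

```python
import srsly

# Inspect a labels file produced by `spacy init labels` (hypothetical path)
label_data = srsly.read_json("labels/tagger.json")
print(label_data)
```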

spacy/cli/templates/quickstart_training.jinja:
@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [system]
 {% if use_transformer -%}

spacy/cli/train.py:
@@ -40,7 +40,7 @@ def train_cli(

     DOCS: https://nightly.spacy.io/api/cli#train
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
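
The same ERROR-to-INFO switch is applied across the CLI commands in this merge.
The practical effect, as a rough sketch: status messages from spaCy's logger are
shown by default, and `--verbose` is only needed for DEBUG output.

```python
import logging
from spacy import util

# INFO is now the default level for spaCy's CLI logger
util.logger.setLevel(logging.INFO)
util.logger.info("shown by default")
util.logger.debug("still requires --verbose")
```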
@@ -50,6 +50,7 @@ def train_cli(
     msg.divider("Initializing pipeline")
     with show_validation_error(config_path, hint_fill=False):
         nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu, silent=False)


spacy/default_config.cfg:
@@ -1,6 +1,6 @@
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 vectors = null
 vocab_data = null
 init_tok2vec = null

spacy/errors.py:
@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
+            "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
             "return the nlp object but got: {value}. Maybe you forgot to return "
             "the modified object in your function?")
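
E914 concerns registered callbacks that receive the `nlp` object and must hand
it back. A minimal sketch of a well-behaved callback (the registry name is
hypothetical):

```python
import spacy
from spacy.language import Language

@spacy.registry.callbacks("my_callback.v1")  # hypothetical name
def create_callback():
    def callback(nlp: Language) -> Language:
        # ... modify the pipeline here ...
        return nlp  # forgetting this return is what triggers E914
    return callback
```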

spacy/pipeline/pipe.pyx:
@@ -36,7 +36,7 @@ cdef class Pipe:
     @property
     def labels(self) -> Optional[Tuple[str]]:
         return []

     @property
     def label_data(self):
         """Optional JSON-serializable data that would be sufficient to recreate
@@ -207,7 +207,7 @@ cdef class Pipe:

         DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
-        raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
+        pass

     def _ensure_examples(self, get_examples):
         if get_examples is None or not hasattr(get_examples, "__call__"):

spacy/tests/serialize/test_serialize_config.py:
@@ -14,8 +14,8 @@ from ..util import make_tempdir

 nlp_config_string = """
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [corpora]
@@ -309,7 +309,7 @@ def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
     assert config["corpora"]["train"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
-    assert interpolated["corpora"]["train"]["path"] == ""
+    assert interpolated["corpora"]["train"]["path"] is None
     nlp = English.from_config(config)
     assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
@@ -317,10 +317,10 @@ def test_config_interpolation():
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
-    assert interpolated2["corpora"]["train"]["path"] == ""
+    assert interpolated2["corpora"]["train"]["path"] is None
     assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
     nlp2 = English.from_config(interpolated)
-    assert nlp2.config["corpora"]["train"]["path"] == ""
+    assert nlp2.config["corpora"]["train"]["path"] is None
     assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342

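
These assertions pin down what the `""` to `null` switch in the configs above
means in practice: a `null` value interpolates to Python `None` instead of an
empty string. A standalone sketch using thinc's `Config`, mirroring the test:

```python
from thinc.api import Config

cfg_str = """
[paths]
train = null

[corpora]

[corpora.train]
path = ${paths.train}
"""
config = Config().from_str(cfg_str, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"
assert config.interpolate()["corpora"]["train"]["path"] is None
```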

spacy/tests/test_cli.py:
@@ -3,10 +3,11 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.util import ENV_VARS
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
+from spacy.cli._util import string_to_list
 from thinc.api import ConfigValidationError
 import srsly
 import os
@@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):


 def test_parse_cli_overrides():
-    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
     result = parse_config_overrides([])
     assert len(result) == 4
     assert result["x.foo"] == "bar"
     assert result["x.bar"] == 12
     assert result["x.baz"] is False
     assert result["y.foo"] == "hello"
-    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
     assert parse_config_overrides([], env_var=None) == {}
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    del os.environ[OVERRIDES_ENV_VAR]
+    del os.environ[ENV_VARS.CONFIG_OVERRIDES]


 @pytest.mark.parametrize("lang", ["en", "nl"])

spacy/training/corpus.py:
@@ -7,7 +7,7 @@ import srsly
 from .. import util
 from .augment import dont_augment
 from .example import Example
-from ..errors import Warnings
+from ..errors import Warnings, Errors
 from ..tokens import DocBin, Doc
 from ..vocab import Vocab
@@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"

 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path,
+    path: Optional[Path],
     gold_preproc: bool,
     max_length: int = 0,
     limit: int = 0,
     augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
+    if path is None:
+        raise ValueError(Errors.E913)
     util.logger.debug(f"Loading corpus from path: {path}")
     return Corpus(
         path,
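
Together with the new E913 error above, the reader now fails fast if the corpus
path was never filled in (e.g. `paths.train` left as `null`). A minimal sketch
of what that looks like:

```python
from spacy.training.corpus import create_docbin_reader

# path=None now raises E913 immediately, instead of failing later
# when the corpus is first iterated during training.
try:
    create_docbin_reader(None, gold_preproc=False)
except ValueError as err:
    print(err)  # Corpus path can't be None. Maybe you forgot to define it ...
```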

spacy/util.py:
@@ -67,10 +67,14 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 # fmt: on


-logging.basicConfig()
+logging.basicConfig(format="%(message)s")
 logger = logging.getLogger("spacy")


+class ENV_VARS:
+    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
+
+
 class registry(thinc.registry):
     languages = catalogue.create("spacy", "languages", entry_points=True)
     architectures = catalogue.create("spacy", "architectures", entry_points=True)

website/docs/api/corpus.md:
@@ -32,14 +32,16 @@ streaming.
 > gold_preproc = false
 > max_length = 0
 > limit = 0
+> augmenter = null
 > ```

 | Name           | Description |
 | -------------- | ----------- |
 | `path`         | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
 | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
 | `max_length`   | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+| `augmenter`    | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |

 ```python
 %%GITHUB_SPACY/spacy/training/corpus.py
@@ -74,7 +76,7 @@ train/test skew.
 | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
 | `max_length`   | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
-| `augmenter`    | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
+| `augmenter`    | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |

 ## Corpus.\_\_call\_\_ {#call tag="method"}
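
The documented augmenter type is a callable that takes the `nlp` object and an
`Example` and yields `Example` objects. A trivial conforming callback, for
illustration only:

```python
from typing import Iterable
from spacy.language import Language
from spacy.training import Corpus, Example

def identity_augmenter(nlp: Language, example: Example) -> Iterable[Example]:
    # No-op augmenter matching the documented signature; a real augmenter
    # would also yield modified copies (e.g. with replaced punctuation).
    yield example

corpus = Corpus("./corpus/train.spacy", gold_preproc=False, augmenter=identity_augmenter)
```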

website/docs/api/top-level.md:
@@ -191,16 +191,16 @@ browser. Will run a simple web server.
 > displacy.serve([doc1, doc2], style="dep")
 > ```

 | Name      | Description |
 | --------- | ----------- |
 | `docs`    | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
 | `style`   | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
 | `page`    | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
 | `minify`  | Minify HTML markup. Defaults to `False`. ~~bool~~ |
 | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
 | `manual`  | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
 | `port`    | Port to serve visualization. Defaults to `5000`. ~~int~~ |
 | `host`    | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |

 ### displacy.render {#displacy.render tag="method" new="2"}
@@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
 | `page`      | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
 | `minify`    | Minify HTML markup. Defaults to `False`. ~~bool~~ |
 | `options`   | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
 | `manual`    | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
 | `jupyter`   | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
 | **RETURNS** | The rendered HTML markup. ~~str~~ |
@@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
 | Name                                       | Description |
 | ------------------------------------------ | ----------- |
 | `fine_grained`                             | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
 | `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
 | `collapse_punct`                           | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
 | `collapse_phrases`                         | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
 | `compact`                                  | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
@@ -498,12 +498,13 @@ the [`Corpus`](/api/corpus) class.
 > limit = 0
 > ```

 | Name           | Description |
 | -------------- | ----------- |
 | `path`         | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~ |
 | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
 | `max_length`   | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+| `augmenter`    | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |

 ### JsonlReader {#jsonlreader}
@@ -935,7 +936,7 @@ Compile a sequence of prefix rules into a regex object.
 | Name        | Description |
 | ----------- | ----------- |
 | `entries`   | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
 | **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |

 ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}

@@ -952,7 +953,7 @@ Compile a sequence of suffix rules into a regex object.
 | Name        | Description |
 | ----------- | ----------- |
 | `entries`   | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
 | **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |

 ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}

@@ -969,7 +970,7 @@ Compile a sequence of infix rules into a regex object.
 | Name        | Description |
 | ----------- | ----------- |
 | `entries`   | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
 | **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |

 ### util.minibatch {#util.minibatch tag="function" new="2"}

website/docs/images/lifecycle.svg (new file, 93 lines, 83 KiB):
File diff suppressed because one or more lines are too long.

website/docs/usage/processing-pipelines.md:
@@ -32,7 +32,7 @@ the [config](/usage/training#config):

 ```ini
 [nlp]
-pipeline = ["tagger", "parser", "ner"]
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
 ```

 import Accordion from 'components/accordion.js'
@@ -167,8 +167,8 @@ the binary data:
 ```python
 ### spacy.load under the hood
 lang = "en"
-pipeline = ["tagger", "parser", "ner"]
-data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
+data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"

 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()  # 2. Initialize it
@@ -197,9 +197,9 @@ list of human-readable component names.

 ```python
 print(nlp.pipeline)
-# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
 print(nlp.pipe_names)
-# ['tagger', 'parser', 'ner']
+# ['tok2vec', 'tagger', 'parser', 'ner']
 ```

 ### Built-in pipeline components {#built-in}
@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.

 | Name                                 | Description |
 | ------------------------------------ | ----------- |
 | [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
 | [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
 | [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |

 <Infobox title="Custom trainable components and models" emoji="📖">

website/docs/usage/training.md:
@@ -6,8 +6,9 @@ menu:
   - ['Introduction', 'basics']
   - ['Quickstart', 'quickstart']
   - ['Config System', 'config']
-  <!-- - ['Data Utilities', 'data'] -->
+  - ['Custom Training', 'config-custom']
   - ['Custom Functions', 'custom-functions']
+  - ['Data Utilities', 'data']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
 ---
@@ -122,7 +123,7 @@ treebank.

 </Project>

-## Training config {#config}
+## Training config system {#config}

 Training config files include all **settings and hyperparameters** for training
 your pipeline. Instead of providing lots of arguments on the command line, you
@@ -177,6 +178,7 @@ sections of a config file are:
 | `system`      | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training`    | Settings and controls for the training and evaluation process. |
 | `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
+| `initialize`  | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime). |

 <Infobox title="Config format and settings" emoji="📖">
@@ -190,6 +192,20 @@ available for the different architectures are documented with the

 </Infobox>

+### Config lifecycle at runtime and training {#config-lifecycle}
+
+A pipeline's `config.cfg` is considered the "single source of truth", both at
+**training** and **runtime**. Under the hood,
+[`Language.from_config`](/api/language#from_config) takes care of constructing
+the `nlp` object using the settings defined in the config. An `nlp` object's
+config is available as [`nlp.config`](/api/language#config) and it includes all
+information about the pipeline, as well as the settings used to train and
+initialize it.
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+<!-- TODO: explain lifecycle and initialization -->
+
 ### Overwriting config settings on the command line {#config-overrides}

 The config system means that you can define all settings **in one place** and in
@@ -233,6 +249,61 @@ defined in the config file.
 $ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
 ```

+### Using variable interpolation {#config-interpolation}
+
+Another very useful feature of the config system is that it supports variable
+interpolation for both **values and sections**. This means that you only need to
+define a setting once and can reference it across your config using the
+`${section.value}` syntax. In this example, the value of `seed` is reused within
+the `[training]` block, and the whole block of `[training.optimizer]` is reused
+in `[pretraining]` and will become `pretraining.optimizer`.
+
+```ini
+### config.cfg (excerpt) {highlight="5,18"}
+[system]
+seed = 0
+
+[training]
+seed = ${system.seed}
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+
+[pretraining]
+optimizer = ${training.optimizer}
+```
+
+You can also use variables inside strings. In that case, it works just like
+f-strings in Python. If the value of a variable is not a string, it's converted
+to a string.
+
+```ini
+[paths]
+version = 5
+root = "/Users/you/data"
+train = "${paths.root}/train_${paths.version}.spacy"
+# Result: /Users/you/data/train_5.spacy
+```
+
+<Infobox title="Tip: Override variables on the CLI" emoji="💡">
+
+If you need to change certain values between training runs, you can define them
+once, reference them as variables and then [override](#config-overrides) them on
+the CLI. For example, `--paths.root /other/root` will change the value of `root`
+in the block `[paths]` and the change will be reflected across all other values
+that reference this variable.
+
+</Infobox>
+
 ## Customizing the pipeline and training {#config-custom}

 ### Defining pipeline components {#config-components}

 You typically train a [pipeline](/usage/processing-pipelines) of **one or more
@@ -353,59 +424,6 @@ stop = 1000
 compound = 1.001
 ```

-### Using variable interpolation {#config-interpolation}
-
-Another very useful feature of the config system is that it supports variable
-interpolation for both **values and sections**. This means that you only need to
-define a setting once and can reference it across your config using the
-`${section.value}` syntax. In this example, the value of `seed` is reused within
-the `[training]` block, and the whole block of `[training.optimizer]` is reused
-in `[pretraining]` and will become `pretraining.optimizer`.
-
-```ini
-### config.cfg (excerpt) {highlight="5,18"}
-[system]
-seed = 0
-
-[training]
-seed = ${system.seed}
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 1e-8
-
-[pretraining]
-optimizer = ${training.optimizer}
-```
-
-You can also use variables inside strings. In that case, it works just like
-f-strings in Python. If the value of a variable is not a string, it's converted
-to a string.
-
-```ini
-[paths]
-version = 5
-root = "/Users/you/data"
-train = "${paths.root}/train_${paths.version}.spacy"
-# Result: /Users/you/data/train_5.spacy
-```
-
-<Infobox title="Tip: Override variables on the CLI" emoji="💡">
-
-If you need to change certain values between training runs, you can define them
-once, reference them as variables and then [override](#config-overrides) them on
-the CLI. For example, `--paths.root /other/root` will change the value of `root`
-in the block `[paths]` and the change will be reflected across all other values
-that reference this variable.
-
-</Infobox>
-
 ### Model architectures {#model-architectures}

 > #### 💡 Model type annotations
@@ -506,17 +524,7 @@ still look good.

 </Accordion>

-<!--
-## Data Utilities {#data-utilities}
-
-* spacy convert
-* The [corpora] block
-* Custom corpus class
-* Minibatching
-* Data augmentation
--->
-
-## Custom Functions {#custom-functions}
+## Custom functions {#custom-functions}

 Registered functions in the training config files can refer to built-in
 implementations, but you can also plug in fully **custom implementations**. All
@@ -763,7 +771,96 @@ start = 2
 factor = 1.005
 ```

-#### Example: Custom data reading and batching {#custom-code-readers-batchers}
+### Defining custom architectures {#custom-architectures}
+
+Built-in pipeline components such as the tagger or named entity recognizer are
+constructed with default neural network [models](/api/architectures). You can
+change the model architecture entirely by implementing your own custom models
+and providing those in the config when creating the pipeline component. See the
+documentation on [layers and model architectures](/usage/layers-architectures)
+for more details.
+
+> ```ini
+> ### config.cfg
+> [components.tagger]
+> factory = "tagger"
+>
+> [components.tagger.model]
+> @architectures = "custom_neural_network.v1"
+> output_width = 512
+> ```
+
+```python
+### functions.py
+from typing import List
+from thinc.types import Floats2d
+from thinc.api import Model
+import spacy
+from spacy.tokens import Doc
+
+@spacy.registry.architectures("custom_neural_network.v1")
+def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
+    return create_model(output_width)
+```
+
+## Data utilities {#data}
+
+spaCy includes various features and utilities to make it easy to train from your
+own data. If you have training data in a standard format like `.conll` or
+`.conllu`, the easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+
+```cli
+$ python -m spacy convert ./train.gold.conll ./corpus
+```
+
+<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
+
+Training workflows often consist of multiple steps, from preprocessing the data
+all the way to packaging and deploying the trained model.
+[spaCy projects](/usage/projects) let you define all steps in one file, manage
+data assets, track changes and share your end-to-end processes with your team.
+
+</Infobox>
+
+### Working with corpora {#data-corpora}
+
+> #### Example
+>
+> ```ini
+> [corpora]
+>
+> [corpora.train]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.train}
+> gold_preproc = false
+> max_length = 0
+> limit = 0
+> augmenter = null
+>
+> [training]
+> train_corpus = "corpora.train"
+> ```
+
+The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
+you define **data resources** to use for training, evaluation, pretraining or
+any other custom workflows. `corpora.train` and `corpora.dev` are used as
+conventions within spaCy's default configs, but you can also define any other
+custom blocks. Each section in the corpora config should resolve to a
+[`Corpus`](/api/corpus) – for example, using spaCy's built-in
+[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
+file. The `train_corpus` and `dev_corpus` fields in the
+[`[training]`](/api/data-formats#config-training) block specify where to find
+the corpus in your config. This makes it easy to **swap out** different corpora
+by only changing a single config setting.
+
+Instead of making `[corpora]` a block with multiple subsections for each portion
+of the data, you can also use a single function that returns a dictionary of
+corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
+especially useful if you need to split a single file into corpora for training
+and evaluation, without loading the same file twice.
+
+### Custom data reading and batching {#custom-code-readers-batchers}

 Some use-cases require **streaming in data** or manipulating datasets on the
 fly, rather than generating all data beforehand and storing it to file. Instead
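
The "single function that returns a dictionary of corpora" pattern described in
the added docs might look like the following sketch. The registry name and the
splitting logic are illustrative, not part of the commit:

```python
from typing import Callable, Dict, Iterable
import spacy
from spacy.language import Language
from spacy.training import Corpus, Example

@spacy.registry.readers("my_corpora.v1")  # hypothetical name
def create_train_dev_corpora(
    path: str, dev_fraction: float
) -> Dict[str, Callable[[Language], Iterable[Example]]]:
    # Usable from the config as:
    #   [corpora]
    #   @readers = "my_corpora.v1"
    #   path = ${paths.train}
    #   dev_fraction = 0.2
    corpus = Corpus(path)

    def split(nlp: Language, dev: bool) -> Iterable[Example]:
        examples = list(corpus(nlp))  # a real reader might cache this
        n_dev = int(len(examples) * dev_fraction)
        yield from examples[:n_dev] if dev else examples[n_dev:]

    return {
        "train": lambda nlp: split(nlp, dev=False),
        "dev": lambda nlp: split(nlp, dev=True),
    }
```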
@@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
     return create_filtered_batches
 ```

-### Defining custom architectures {#custom-architectures}
-
-Built-in pipeline components such as the tagger or named entity recognizer are
-constructed with default neural network [models](/api/architectures). You can
-change the model architecture entirely by implementing your own custom models
-and providing those in the config when creating the pipeline component. See the
-documentation on [layers and model architectures](/usage/layers-architectures)
-for more details.
-
-> ```ini
-> ### config.cfg
-> [components.tagger]
-> factory = "tagger"
->
-> [components.tagger.model]
-> @architectures = "custom_neural_network.v1"
-> output_width = 512
-> ```
-
-```python
-### functions.py
-from typing import List
-from thinc.types import Floats2d
-from thinc.api import Model
-import spacy
-from spacy.tokens import Doc
-
-@spacy.registry.architectures("custom_neural_network.v1")
-def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
-    return create_model(output_width)
-```
+<!-- TODO:
+* Custom corpus class
+* Minibatching
+* Data augmentation
+-->

 ## Parallel & distributed training with Ray {#parallel-training}

website/docs/usage/v3.md:
@@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
 and track changes. You can use the
 [quickstart widget](/usage/training#quickstart) or the `init config` command to
 get started. Instead of providing lots of arguments on the command line, you
-only need to pass your `config.cfg` file to `spacy train`.
-
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
 Training config files include all **settings and hyperparameters** for training
 your pipeline. Some settings can also be registered **functions** that you can
 swap out and customize, making it easy to implement your own custom models and
 architectures.

+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>

 - **Usage:** [Training pipelines and models](/usage/training)
@@ -723,7 +724,7 @@ nlp = spacy.blank("en")

 Because pipeline components are now added using their string names, you won't
 have to instantiate the [component classes](/api/#architecture-pipeline)
-directly anynore. To configure the component, you can now use the `config`
+directly anymore. To configure the component, you can now use the `config`
 argument on [`nlp.add_pipe`](/api/language#add_pipe).

 > #### config.cfg (excerpt)