Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-10-06 14:37:58 +02:00
commit 32011780a2
36 changed files with 401 additions and 228 deletions

View File

@@ -104,9 +104,11 @@ For detailed installation instructions, see the
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
-of `v2.0.13`).
+of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
+your `pip`, `setuptools` and `wheel` are up to date.
 
 ```bash
+pip install -U pip setuptools wheel
 pip install spacy
 ```

View File

@@ -18,7 +18,7 @@ pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools
-packaging
+packaging>=20.0
 importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies

View File

@@ -55,7 +55,7 @@ install_requires =
     pytokenizations
    # Official Python utilities
     setuptools
-    packaging
+    packaging>=20.0
     importlib_metadata>=0.20; python_version < "3.8"
     typing_extensions>=3.7.4; python_version < "3.8"

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a34"
+__version__ = "3.0.0a35"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -278,7 +278,7 @@ def show_validation_error(
                 "fill-config' command to fill in all the defaults, if possible:",
                 spaced=True,
             )
-            print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
+            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
             sys.exit(1)
     except InterpolationError as e:
         msg.fail("Config validation error", e, exits=1)

View File

@@ -1091,10 +1091,11 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(sgd)
+                proc.finish_update(sgd)
         return losses
 
     def rehearse(
@@ -1297,7 +1298,9 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
@@ -1407,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
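Note on the `is_trainable` checks above: a rough sketch (not part of this commit) of what the new guard distinguishes, assuming the spacy-nightly v3 API, where `entity_ruler` is rule-based and `ner` is backed by a Thinc model:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # rule-based: no Thinc model, is_trainable() -> False
nlp.add_pipe("ner")           # statistical: has a Thinc model, is_trainable() -> True

for name, proc in nlp.pipeline:
    # mirrors the check used in Language.update() and Language.pipe() above
    print(name, hasattr(proc, "is_trainable") and proc.is_trainable())
```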

View File

@@ -34,7 +34,7 @@ def StaticVectors(
 def forward(
     model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
 ) -> Tuple[Ragged, Callable]:
-    if not len(docs):
+    if not sum(len(doc) for doc in docs):
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
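A quick illustration of why the old `len(docs)` check was not enough (sketch only, not from this commit): a batch of empty `Doc`s is itself non-empty, so only the total token count catches it.

```python
import spacy

nlp = spacy.blank("en")
docs = [nlp("") for _ in range(2)]        # two Docs with zero tokens

print(len(docs))                          # 2 -> the old check would not bail out
print(sum(len(doc) for doc in docs))      # 0 -> the new check returns the empty output
```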

View File

@@ -238,7 +238,7 @@ class EntityLinker(Pipe):
             )
             bp_context(d_scores)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)

View File

@@ -1,8 +1,10 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly
 
+from .pipe import Pipe
+from ..training import Example
 from ..language import Language
 from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
@@ -50,7 +52,7 @@ def make_entity_ruler(
     )
 
 
-class EntityRuler:
+class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
     `EntityRecognizer` to boost accuracy, or used on its own to implement a
@@ -183,6 +185,26 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)
 
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Sequence[PatternType]] = None,
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns Optional[Iterable[PatternType]]: The list of patterns.
+
+        DOCS: https://nightly.spacy.io/api/entityruler#initialize
+        """
+        if patterns:
+            self.add_patterns(patterns)
+
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
@@ -320,6 +342,12 @@ class EntityRuler:
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)
 
+    def predict(self, docs):
+        pass
+
+    def set_annotations(self, docs, scores):
+        pass
+
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":

View File

@@ -209,7 +209,7 @@ class ClozeMultitask(Pipe):
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
         return losses

View File

@@ -132,7 +132,7 @@ cdef class Pipe:
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -228,6 +228,9 @@ cdef class Pipe:
     def is_resizable(self):
         return hasattr(self, "model") and "resize_output" in self.model.attrs
 
+    def is_trainable(self):
+        return hasattr(self, "model") and isinstance(self.model, Model)
+
     def set_output(self, nO):
         if self.is_resizable():
             self.model.attrs["resize_output"](self.model, nO)
@@ -245,6 +248,17 @@ cdef class Pipe:
         with self.model.use_params(params):
             yield
 
+    def finish_update(self, sgd):
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
     def score(self, examples, **kwargs):
         """Score a batch of examples.

View File

@@ -203,7 +203,7 @@ class Tagger(Pipe):
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
 
         losses[self.name] += loss
         if set_annotations:
@@ -238,7 +238,7 @@ class Tagger(Pipe):
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
-        self.model.finish_update(sgd)
+        self.finish_update(sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()

View File

@@ -212,7 +212,7 @@ class TextCategorizer(Pipe):
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -256,7 +256,7 @@ class TextCategorizer(Pipe):
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += (gradient ** 2).sum()
         return losses

View File

@@ -188,7 +188,7 @@ class Tok2Vec(Pipe):
             accumulate_gradient(one_d_tokvecs)
             d_docs = bp_tokvecs(d_tokvecs)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             return d_docs
 
         batch_id = Tok2VecListener.get_batch_id(docs)

View File

@@ -315,7 +315,7 @@ cdef class Parser(Pipe):
         backprop_tok2vec(golds)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)
@@ -367,7 +367,7 @@ cdef class Parser(Pipe):
         # Do the backprop
         backprop_tok2vec(docs)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss / n_scores
         del backprop
         del backprop_tok2vec
@@ -437,7 +437,9 @@ cdef class Parser(Pipe):
         for name, component in nlp.pipeline:
             if component is self:
                 break
-            if hasattr(component, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
                 doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]

View File

@@ -119,7 +119,7 @@ def validate_init_settings(
     if types don't match or required values are missing.
 
     func (Callable): The initialize method of a given component etc.
-    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
     section (str): Initialize section, for error message.
     name (str): Name of the block in the section.
    exclude (Iterable[str]): Parameter names to exclude from schema.

View File

@@ -10,12 +10,14 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
+    # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
         "natto-py",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
+        "spacy-pkuseg",
     ]
 
     # check requirements.txt

View File

@@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
-    # initialize with patterns from asset
+    # initialize with patterns from misc registry
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }

View File

@@ -1,4 +1,6 @@
 import pytest
+
+from spacy import registry
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
@@ -11,6 +13,7 @@ def nlp():
 
 
 @pytest.fixture
+@registry.misc("entity_ruler_patterns")
 def patterns():
     return [
         {"label": "HELLO", "pattern": "hello world"},
@@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"
 
 
+def test_entity_ruler_init_patterns(nlp, patterns):
+    # initialize with patterns
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    ruler.initialize(lambda: [], patterns=patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+    nlp.remove_pipe("entity_ruler")
+    # initialize with patterns from misc registry
+    nlp.config["initialize"]["components"]["entity_ruler"] = {
+        "patterns": {"@misc": "entity_ruler_patterns"}
+    }
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    nlp.initialize()
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)

View File

@@ -7,6 +7,7 @@ import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
@@ -185,3 +186,22 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     model1 = get_updated_model()
     model2 = get_updated_model()
     assert_array_equal(get_all_params(model1), get_all_params(model2))
+
+
+@pytest.mark.parametrize(
+    "model_func,kwargs",
+    [
+        (StaticVectors, {"nO": 128, "nM": 300}),
+    ]
+)
+def test_empty_docs(model_func, kwargs):
+    nlp = English()
+    model = model_func(**kwargs).initialize()
+    # Test the layer can be called successfully with 0, 1 and 2 empty docs.
+    for n_docs in range(3):
+        docs = [nlp("") for _ in range(n_docs)]
+        # Test predict
+        _ = model.predict(docs)
+        # Test backprop
+        output, backprop = model.begin_update(docs)
+        _ = backprop(output)

View File

@@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info("Initialized pipeline components")
+    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     return nlp

View File

@@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False):
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         msg = Printer(no_print=True)
-        # we assume here that only components are enabled that should be trained & logged
-        logged_pipes = nlp.pipe_names
+        # ensure that only trainable components are logged
+        logged_pipes = [
+            name
+            for name, proc in nlp.pipeline
+            if hasattr(proc, "is_trainable") and proc.is_trainable()
+        ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
@@ -41,19 +45,10 @@ def console_logger(progress_bar: bool = False):
             if progress is not None:
                 progress.update(1)
             return
-        try:
-            losses = [
-                "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (losses)",
-                    key=str(e),
-                    keys=list(info["losses"].keys()),
-                )
-            ) from None
+        losses = [
+            "{0:.2f}".format(float(info["losses"][pipe_name]))
+            for pipe_name in logged_pipes
+        ]
 
         scores = []
         for col in score_cols:
View File

@@ -187,10 +187,11 @@ def train_while_improving(
             for name, proc in nlp.pipeline:
                 if (
                     name not in exclude
-                    and hasattr(proc, "model")
+                    and hasattr(proc, "is_trainable")
+                    and proc.is_trainable()
                     and proc.model not in (True, False, None)
                 ):
-                    proc.model.finish_update(optimizer)
+                    proc.finish_update(optimizer)
             optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:
@@ -293,7 +294,8 @@ def update_meta(
         if metric is not None:
             nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
def create_before_to_disk_callback( def create_before_to_disk_callback(

View File

@@ -128,8 +128,8 @@ Get all patterns that have been added to the attribute ruler in the
 ## AttributeRuler.initialize {#initialize tag="method"}
 
-Initialize the component with data. Typically called before training to load in
-rules from a file. This method is typically called by
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
 [`Language.initialize`](/api/language#initialize) and lets you customize
 arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
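For readers of this diff, the test change in `test_attributeruler_init_patterns` above suggests what such an `[initialize.components]` block can look like. A hypothetical sketch (the registry name mirrors the test fixture, not an official asset):

```ini
# illustration only: patterns registered under @misc as "attribute_ruler_patterns"
[initialize.components.attribute_ruler]

[initialize.components.attribute_ruler.patterns]
@misc = "attribute_ruler_patterns"
```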

View File

@@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show
 ```
 ✘ Config validation error
-training -> dropout   field required
-training -> optimizer field required
-training -> optimize  extra fields not permitted
+dropout   field required
+optimizer field required
+optimize  extra fields not permitted
 
-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
+{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 
 If your config contains missing values, you can run the 'init fill-config'
 command to fill in all the defaults, if possible:
 
-python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
+python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg
 ```
</Accordion> </Accordion>

View File

@@ -180,24 +180,24 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
 | Name                  | Description |
 | --------------------- | ----------- |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
 | `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
 | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
+| `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
 | `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
 | `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
 
 ### pretraining {#config-pretraining tag="section,optional"}
@@ -205,17 +205,17 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/embeddings-transformers#pretraining). It's
 used when you run [`spacy pretrain`](/api/cli#pretrain).
 
 | Name           | Description |
 | -------------- | ----------- |
 | `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
 | `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
 | `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
-| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
-| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
-| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
-| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
+| `optimizer`    | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `corpus`       | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~ |
+| `batcher`      | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `component`    | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
+| `layer`        | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~ |
 
 ### initialize {#config-initialize tag="section"}
@@ -224,6 +224,9 @@ It's used by [`Language.initialize`](/api/language#initialize) and typically
 called right before training (but not at runtime). The section allows you to
 specify local file paths or custom functions to load data resources from,
 without requiring them at runtime when you load the trained pipeline back in.
+Also see the usage guides on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[custom initialization](/usage/training#initialization).
 
 > #### Example
 >

View File

@@ -74,6 +74,38 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
 | `patterns`   | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
 
+## EntityRuler.initialize {#initialize tag="method" new="3"}
+
+Initialize the component with data and used before training to load in rules
+from a file. This method is typically called by
+[`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
+
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl
+> ```
+
+| Name           | Description |
+| -------------- | ----------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |  |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
+
 ## EntityRuler.\_\_len\_\_ {#len tag="method"}
 
 The number of all patterns added to the entity ruler.
@@ -177,7 +209,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a
 ## EntityRuler.from_disk {#from_disk tag="method"}
 
-Load the entity ruler from a file. Expects either a file containing
+Load the entity ruler from a path. Expects either a file containing
 newline-delimited JSON (JSONL) with one entry per line, or a directory
 containing a `patterns.jsonl` file and a `cfg` file with the component
 configuration.
@@ -256,6 +288,6 @@ Get all patterns that were added to the entity ruler.
 | Name              | Description |
 | ----------------- | ----------- |
 | `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~ |
 | `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
 | `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@@ -294,6 +294,24 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |
 
+## Pipe.finish_update {#finish_update tag="method"}
+
+Update parameters using the current parameter gradients. Defaults to calling
+[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(sgd)
+> ```
+
+| Name  | Description                           |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}
 
 > #### Example

File diff suppressed because one or more lines are too long (image preview, Before: 50 KiB)

File diff suppressed because one or more lines are too long (image preview, After: 76 KiB)

View File

@@ -646,7 +646,9 @@ get_candidates = model.attrs["get_candidates"]
 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
 
 ```python
 ### Pipeline component skeleton
@@ -826,7 +828,7 @@ def __call__(self, Doc doc):
 Once our `Pipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
-component with the [`@Language.factory`](/api/lnguage#factory) decorator. This
+component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
 [`nlp.add_pipe`](/api/language#add_pipe) and via the
 [config](/usage/training#config).

View File

@@ -98,10 +98,10 @@ The Chinese language class supports three word segmentation options, `char`,
 > # Jieba
 > cfg = {"segmenter": "jieba"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> # PKUSeg with "default" model provided by pkuseg
+> # PKUSeg with "mixed" model provided by pkuseg
 > cfg = {"segmenter": "pkuseg"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> nlp.tokenizer.initialize(pkuseg_model="default")
+> nlp.tokenizer.initialize(pkuseg_model="mixed")
 > ```
 
 ```ini
@@ -115,7 +115,7 @@ segmenter = "char"
 | --------- | ----------- |
 | `char`    | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. |
 | `jieba`   | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. |
-| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
+| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/explosion/spacy-pkuseg) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
 
 <Infobox title="Changed in v3.0" variant="warning">
@@ -133,10 +133,10 @@ runtime.
 The `initialize` method for the Chinese tokenizer class supports the following
 config settings for loading `pkuseg` models:
 
 | Name               | Description |
 | ------------------ | ----------- |
-| `pkuseg_model`     | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
-| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
+| `pkuseg_model`     | Name of a model provided by `spacy-pkuseg` or the path to a local model directory. ~~str~~ |
+| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`, the default provided dictionary. ~~str~~ |
 
 The initialization settings are typically provided in the
 [training config](/usage/training#config) and the data is loaded in before
@@ -164,14 +164,17 @@ You can also initialize the tokenizer for a blank language class by calling its
 cfg = {"segmenter": "pkuseg"}
 nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
 
-# Load "default" model
-nlp.tokenizer.initialize(pkuseg_model="default")
+# Load spaCy's OntoNotes model
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
+
+# Load pkuseg's "news" model
+nlp.tokenizer.initialize(pkuseg_model="news")
 
 # Load local model
 nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 
 # Override the user directory
-nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes", pkuseg_user_dict="/path/to/user_dict")
 ```
 
 You can also modify the user dictionary on-the-fly:
@@ -195,13 +198,13 @@ The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
 model trained only on
 [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
 models provided by `pkuseg` include data restricted to research use. For
-research use, `pkuseg` provides models for several different domains
-(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses,
-`pkuseg` provides a simple
-[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage):
+research use, `pkuseg` provides models for several different domains (`"mixed"`
+(equivalent to `"default"` from `pkuseg` packages), `"news"` `"web"`,
+`"medicine"`, `"tourism"`) and for other uses, `pkuseg` provides a simple
+[training API](https://github.com/explosion/spacy-pkuseg/blob/master/readme/readme_english.md#usage):
 
 ```python
-import pkuseg
+import spacy_pkuseg as pkuseg
 from spacy.lang.zh import Chinese
 
 # Train pkuseg model

View File

@@ -1172,13 +1172,15 @@ doc = nlp("This is a text...")
 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline. You'll need
-the following:
+plug fully custom machine learning components into your pipeline.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+You'll need the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model implemented in
-   [Thinc](/usage/layers-architectures#thinc), or a
-   [wrapped model](/usage/layers-architectures#frameworks) implemented in
+   can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or
+   a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
@@ -1283,7 +1285,7 @@ loss is calculated and to add evaluation scores to the training output.
 For more details on how to implement your own trainable components and model
 architectures, and plug existing models implemented in PyTorch or TensorFlow
 into your spaCy pipeline, see the usage guide on
-[layers and model architectures](/usage/layers-architectures).
+[layers and model architectures](/usage/layers-architectures#components).
 
 </Infobox>

View File

@@ -404,8 +404,73 @@ import Training101 from 'usage/101/\_training.md'
 <Infobox title="Training pipelines and models" emoji="📖">
 
 To learn more about **training and updating** pipelines, how to create training
-data and how to improve spaCy's named entity recognition models, see the usage
-guides on [training](/usage/training).
+data and how to improve spaCy's named models, see the usage guides on
+[training](/usage/training).
+
+</Infobox>
+
+### Training config and lifecycle {#training-config}
+
+Training config files include all **settings and hyperparameters** for training
+your pipeline. Instead of providing lots of arguments on the command line, you
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+This also makes it easy to integrate custom models and architectures, written in
+your framework of choice. A pipeline's `config.cfg` is considered the "single
+source of truth", both at **training** and **runtime**.
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training]
+> accumulate_gradient = 3
+>
+> [training.optimizer]
+> @optimizers = "Adam.v1"
+>
+> [training.optimizer.learn_rate]
+> @schedules = "warmup_linear.v1"
+> warmup_steps = 250
+> total_steps = 20000
+> initial_rate = 0.01
+> ```
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+<Infobox title="Training configuration system" emoji="📖">
+
+For more details on spaCy's **configuration system** and how to use it to
+customize your pipeline components, component models, training settings and
+hyperparameters, see the [training config](/usage/training#config) usage guide.
+
+</Infobox>
+
+### Trainable components {#training-components}
+
+spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
+components that have their own model instance, make predictions over `Doc`
+objects and can be updated using [`spacy train`](/api/cli#train). This lets you
+plug fully custom machine learning components into your pipeline that can be
+configured via a single training config.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.my_component]
+> factory = "my_component"
+>
+> [components.my_component.model]
+> @architectures = "my_model.v1"
+> width = 128
+> ```
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+<Infobox title="Custom trainable components" emoji="📖">
+
+To learn more about how to implement your own **model architectures** and use
+them to power custom **trainable components**, see the usage guides on the
+[trainable component API](/usage/processing-pipelines#trainable-components) and
+implementing [layers and architectures](/usage/layers-architectures#components)
+for trainable components.
 
 </Infobox>

View File

@@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
 **not updated** during training and are included in the final trained pipeline
-as-is.
+as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
 
 > #### Note on frozen components
 >
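As context for the change above, a minimal illustrative `[training]` excerpt that freezes a component might look like this (the component name is hypothetical):

```ini
# illustration only: keep a pretrained "ner" component untouched while training others
[training]
frozen_components = ["ner"]
```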

View File

@@ -168,9 +168,13 @@ follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each
 `Model` can also be used as a sublayer of a larger network, allowing you to
 freely combine implementations from different frameworks into a single model.
 
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>
 
-- **Usage: ** [Layers and architectures](/usage/layers-architectures)
+- **Usage: ** [Layers and architectures](/usage/layers-architectures),
+  [Trainable component API](/usage/processing-pipelines#trainable-components),
+  [Trainable components and models](/usage/layers-architectures#components)
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)
@@ -503,36 +507,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Pipeline package symlinks, the `link` command and shortcut names are now
   deprecated. There can be many [different trained pipelines](/models) and not
   just one "English model", so you should always use the full package name like
-  [`en_core_web_sm`](/models/en) explicitly.
-- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the package name, author, license and labels. It's
-  **not** used to construct the processing pipeline anymore. This is all defined
-  in the [`config.cfg`](/api/data-formats#config), which also includes all
-  settings used to train the pipeline.
-- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
-  only take a `config.cfg` file containing the full
-  [training config](/usage/training#config).
+  `en_core_web_sm` explicitly.
+- A pipeline's `meta.json` is now only used to provide meta information like the
+  package name, author, license and labels. It's **not** used to construct the
+  processing pipeline anymore. This is all defined in the
+  [`config.cfg`](/api/data-formats#config), which also includes all settings
+  used to train the pipeline.
+- The `train`, `pretrain` and `debug data` commands now only take a
+  `config.cfg`.
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
-- [`Language.update`](/api/language#update) now takes a batch of
-  [`Example`](/api/example) objects instead of raw texts and annotations, or
-  `Doc` and `GoldParse` objects.
-- The `Language.disable_pipes` context manager has been replaced by
-  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
-  disable or enable components.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
   [`Pipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
-  [`Language.initialize`](/api/language#initialize) and
-  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
-  sequence of `Example` objects to initialize the model instead of a list of
-  tuples.
-- The `begin_training` methods have been renamed to `initialize`.
+- The `begin_training` methods have been renamed to `initialize` and now take a
+  function that returns a sequence of `Example` objects to initialize the model
+  instead of a list of tuples.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
@@ -557,7 +552,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | Removed | Replacement |
 | ------- | ----------- |
-| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) |
 | `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
 | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
 | `GoldParse` | [`Example`](/api/example) |
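To make the `begin_training` to `initialize` rename described above concrete, here is a rough sketch against the v3 nightly API; the component, labels and example data are made up for illustration:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

examples = [
    Example.from_dict(nlp.make_doc("I loved it"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("I hated it"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

# v2.x: optimizer = nlp.begin_training()
# v3.x: initialize takes a function that returns Example objects
optimizer = nlp.initialize(lambda: examples)
losses = nlp.update(examples, sgd=optimizer)
```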