begin_training -> initialize

Ines Montani 2020-09-28 21:35:09 +02:00
parent 046f655d86
commit ff9a63bfbd
57 changed files with 301 additions and 253 deletions
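For downstream code, the change is a one-line rename: `nlp.begin_training()` becomes `nlp.initialize()`, and the old name is kept as a deprecated alias that emits the new W089 warning. A minimal migration sketch (the pipeline and training data here are illustrative, not taken from this commit):

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("textcat").add_label("POSITIVE")
    train_examples = [
        Example.from_dict(nlp.make_doc("so happy"), {"cats": {"POSITIVE": 1.0}}),
    ]

    # Before this commit:
    # optimizer = nlp.begin_training(get_examples=lambda: train_examples)
    # After this commit (the old name still works, but warns with W089):
    optimizer = nlp.initialize(get_examples=lambda: train_examples)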


@@ -103,12 +103,12 @@ def debug_model(
     with data_validation(False):
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(


@@ -85,6 +85,7 @@ class Warnings:
             "attribute or operator.")
     # TODO: fix numbering after merging develop into master
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@@ -306,7 +307,7 @@ class Errors:
             "settings: {opts}")
     E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
     E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call begin_training()?")
+            "call initialize()?")
     E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
     E111 = ("Pickling a token is not supported, because tokens are only views "
             "of the parent Doc and can't exist on their own. A pickled token "
@@ -376,7 +377,7 @@ class Errors:
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. This can be fixed "
             "by calling add_label, or by providing a representative batch of "
-            "examples to the component's begin_training method.")
+            "examples to the component's initialize method.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -517,7 +518,7 @@ class Errors:
             "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+    E930 = ("Received invalid get_examples callback in {name}.initialize. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
     E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "


@@ -1154,6 +1154,16 @@ class Language:
         *,
         sgd: Optional[Optimizer] = None,
         device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd, device=device)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
+        device: int = -1,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1163,11 +1173,11 @@ class Language:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
@@ -1179,7 +1189,7 @@ class Language:
         for example in get_examples():
             if not isinstance(example, Example):
                 err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
+                    name="Language.initialize", types=type(example)
                 )
                 raise ValueError(err)
             else:
@@ -1198,8 +1208,8 @@ class Language:
             sgd = create_default_optimizer()
         self._optimizer = sgd
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
+            if hasattr(proc, "initialize"):
+                proc.initialize(
                     get_examples, pipeline=self.pipeline, sgd=self._optimizer
                 )
         self._link_components()
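Because `begin_training` now only warns and forwards its arguments, existing training scripts keep working unchanged. A quick sketch of surfacing the warning (assuming the nightly v3 API at this commit):

    import warnings
    import spacy

    nlp = spacy.blank("en")
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        nlp.begin_training()  # forwards to nlp.initialize() and warns with W089
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)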


@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
         labeller.model.set_dim("nO", len(self.labels))
         if labeller.model.has_ref("output_layer"):
             labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-        labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+        labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)

     @property
     def labels(self):


@@ -140,7 +140,7 @@ class EntityLinker(Pipe):
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -159,7 +159,7 @@ class EntityLinker(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
         self._require_kb()


@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -142,7 +142,7 @@ class Morphologizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
         # First, fetch all labels from the data


@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
+        self.model.output_layer.initialize(X)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd


@@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
         labeller.model.set_dim("nO", len(self.labels))
         if labeller.model.has_ref("output_layer"):
             labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-        labeller.begin_training(get_examples, pipeline=pipeline)
+        labeller.initialize(get_examples, pipeline=pipeline)

     @property
     def labels(self):


@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -198,7 +198,7 @@ cdef class Pipe:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
         raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))


@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         pass

     def __call__(self, doc):


@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -256,7 +256,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -269,7 +269,7 @@ class Tagger(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
         self.labels = tuple(list(self.labels) + [label])
         return 1

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
         subbatch = []  # Select a subbatch of examples to initialize the model


@@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
     def get_loss(self, examples, scores) -> None:
         pass

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self._ensure_examples(get_examples)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})


@@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     doc.ents = [("ANIMAL", 3, 4)]
@@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
     doc.ents = list(doc.ents)


@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):
@@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training(lambda: [_ner_example(ner1)])
+    ner1.initialize(lambda: [_ner_example(ner1)])
     ner2 = EntityRecognizer(Vocab(), model, **config)
     # the second model needs to be resized before we can call from_bytes


@@ -202,7 +202,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -213,7 +213,7 @@ def test_train_empty():
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     # The untrained NER will predict O for each token
     doc = nlp("I live in New York")
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@@ -235,7 +235,7 @@ def test_empty_ner():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
     result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@@ -254,7 +254,7 @@ def test_ruler_before_ner():
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -269,7 +269,7 @@ def test_ner_before_ruler():
     # 1: untrained NER - should set everything to O
     untrained_ner = nlp.add_pipe("ner", name="uner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # 2 : Entity Ruler - should set "this" to B and keep everything else O
     patterns = [{"label": "THING", "pattern": "This"}]
@@ -290,7 +290,7 @@ def test_block_ner():
     nlp.add_pipe("blocker", config={"start": 2, "end": 5})
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti L Korhonen speaking in Finland")
     expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
     expected_types = ["", "", "", "", "", "", "", ""]
@@ -307,7 +307,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for ent in annotations.get("entities"):
             ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(50):
         losses = {}
@@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" in caplog.text
     caplog.clear()
     nlp.vocab.lookups.add_table("lexeme_norm")
     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" not in caplog.text
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name

     def __call__(self, doc):
-        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
         return doc


@@ -191,7 +191,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)


@@ -34,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):


@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])

 def test_kb_empty(nlp):
@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])

 def test_kb_serialize(nlp):
@@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
     entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length

     # test whether the entity links are preserved by the `as_doc()` function
@@ -463,7 +463,7 @@ def test_overfitting_IO():
     )
     # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert entity_linker.model.get_dim("nO") == vector_length
     assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length


@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_implicit_label():
@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

 def test_no_resize():
@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     for i in range(50):
         losses = {}


@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False
     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(200):
         losses = {}


@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)

-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_no_resize():
@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):
@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []
@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: [])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):
@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),
@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_implicit_label():
@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

 def test_no_resize():
@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []
@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2

     for i in range(50):
@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)


@@ -88,7 +88,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")
@@ -154,7 +154,7 @@ def test_tok2vec_listener():
     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

     for i in range(5):


@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
for raw_text, entity_offsets in TRAIN_DATA: for raw_text, entity_offsets in TRAIN_DATA:


@@ -250,7 +250,7 @@ def test_issue1915():
     ner = nlp.add_pipe("ner")
     ner.add_label("answer")
     with pytest.raises(ValueError):
-        nlp.begin_training(**cfg)
+        nlp.initialize(**cfg)

 def test_issue1945():


@@ -30,7 +30,7 @@ def test_issue2179():
     nlp = Italian()
     ner = nlp.add_pipe("ner")
     ner.add_label("CITIZENSHIP")
-    nlp.begin_training()
+    nlp.initialize()
     nlp2 = Italian()
     nlp2.add_pipe("ner")
     assert len(nlp2.get_pipe("ner").labels) == 0


@@ -18,7 +18,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("hello world")
     assert doc.has_annotation("TAG")
     docs = nlp.pipe(["hello", "world"])
@@ -149,7 +149,7 @@ def test_issue2800():
     ner = nlp.add_pipe("ner")
     for entity_type in list(entity_types):
         ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(20):
         losses = {}
         random.shuffle(train_data)


@@ -92,7 +92,7 @@ def test_issue3209():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("ANIMAL")
-    nlp.begin_training()
+    nlp.initialize()
     move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
     assert ner.move_names == move_names
     nlp2 = English()
@@ -239,7 +239,7 @@ def test_issue3456():
     nlp = English()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     list(nlp.pipe(["hi", ""]))


@@ -223,7 +223,7 @@ def test_issue3611():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" not in parser.labels

@@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" in parser.labels

@@ -342,7 +342,7 @@ def test_issue3880():
     nlp.add_pipe("parser").add_label("dep")
     nlp.add_pipe("ner").add_label("PERSON")
     nlp.add_pipe("tagger").add_label("NN")
-    nlp.begin_training()
+    nlp.initialize()
     for doc in nlp.pipe(texts):
         pass


@@ -66,7 +66,7 @@ def test_issue4030():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -87,7 +87,7 @@ def test_issue4042():
     # add ner pipe
     ner = nlp.add_pipe("ner")
     ner.add_label("SOME_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # Add entity ruler
     patterns = [
         {"label": "MY_ORG", "pattern": "Apple"},
@@ -118,7 +118,7 @@ def test_issue4042_bug2():
     # add ner pipe
     ner1 = nlp1.add_pipe("ner")
     ner1.add_label("SOME_LABEL")
-    nlp1.begin_training()
+    nlp1.initialize()
     # add a new label to the doc
     doc1 = nlp1("What do you think about Apple ?")
     assert len(ner1.labels) == 1
@@ -244,7 +244,7 @@ def test_issue4267():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("PEOPLE")
-    nlp.begin_training()
+    nlp.initialize()
     assert "ner" in nlp.pipe_names
     # assert that we have correct IOB annotations
     doc1 = nlp("hi")
@@ -299,7 +299,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training(lambda: [])
+    ner.initialize(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1
@@ -327,7 +327,7 @@ def test_issue4348():
     TRAIN_DATA = [example, example]
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))


@@ -180,7 +180,7 @@ def test_issue4725_2():
     vocab.set_vector("dog", data[1])
     nlp = English(vocab=vocab)
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     docs = ["Kurt is in London."] * 10
     for _ in nlp.pipe(docs, batch_size=2, n_process=2):
         pass


@@ -64,7 +64,7 @@ def tagger():
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     return tagger

@@ -85,7 +85,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    nlp.begin_training()
+    nlp.initialize()
     return entity_linker


@@ -25,7 +25,7 @@ def test_issue5551():
         pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
         for label in set(example[1]["cats"]):
             pipe.add_label(label)
-        nlp.begin_training()
+        nlp.initialize()
         # Store the result of each iteration
         result = pipe.model.predict([nlp.make_doc(example[0])])


@@ -152,7 +152,7 @@ def test_serialize_nlp():
     nlp_config = Config().from_str(nlp_config_string)
     nlp = load_model_from_config(nlp_config, auto_fill=True)
     nlp.get_pipe("tagger").add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert "tok2vec" in nlp.pipe_names
     assert "tagger" in nlp.pipe_names
     assert "parser" not in nlp.pipe_names
@@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
     parser_cfg = dict()
     parser_cfg["model"] = {"@architectures": "my_test_parser"}
     nlp.add_pipe("parser", config=parser_cfg)
-    nlp.begin_training()
+    nlp.initialize()
     with make_tempdir() as d:
         nlp.to_disk(d)
@@ -191,7 +191,7 @@ def test_serialize_parser():
     model_config = Config().from_str(parser_config_string)
     parser = nlp.add_pipe("parser", config=model_config)
     parser.add_label("nsubj")
-    nlp.begin_training()
+    nlp.initialize()
     with make_tempdir() as d:
         nlp.to_disk(d)


@@ -18,7 +18,7 @@ def nlp():
     textcat = nlp.add_pipe("textcat")
     for label in ("POSITIVE", "NEGATIVE"):
         textcat.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     return nlp


@@ -47,7 +47,7 @@ def test_readers():
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
@@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         assert example.y.cats
         # this shouldn't fail if each training example has at least one positive label


@@ -600,7 +600,7 @@ def _train_tuples(train_data):
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))


@@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
         msg.info(f"Resuming training for: {resume_components}")
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
+    # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:


@@ -517,18 +517,18 @@ specific data and challenge.

Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
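As a rough sketch of how one of these architectures is wired up in practice, the block below plugs the BOW model into a `textcat` component via `nlp.add_pipe`; the parameter values are illustrative rather than tuned defaults, and the other textcat architectures above are referenced the same way:

```python
import spacy

# Sketch only: "spacy.TextCatBOW.v1" is the registered architecture name from
# the table above; the parameter values are arbitrary example settings.
config = {
    "model": {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 1,
        "no_output_layer": False,
        # nO is left unset so the TextCategorizer can infer it in initialize()
    }
}
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config=config)
```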
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:

The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.

| Name | Description |
| ----------- | ----------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.EmptyKB.v1 {#EmptyKB}


@@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## DependencyParser.begin_training {#begin_training tag="method"}
+## DependencyParser.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```

@@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityLinker.begin_training {#begin_training tag="method"}
+## EntityLinker.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```


@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityRecognizer.begin_training {#begin_training tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```

@@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |

-## Language.begin_training {#begin_training tag="method"}
+## Language.initialize {#initialize tag="method"}

Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`begin_training`](/api/pipe#begin_training) method, if
+passed to each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.

-If no `get_examples` function is provided when calling `nlp.begin_training`, the
+If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).

-<Infobox variant="warning" title="Changed in v3.0">
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

-The `Language.update` method now takes a **function** that is called with no
-arguments and returns a sequence of [`Example`](/api/example) objects instead of
-tuples of `Doc` and `GoldParse` objects.
+This method was previously called `begin_training`. It now also takes a
+**function** that is called with no arguments and returns a sequence of
+[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
+objects.

</Infobox>
@@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
-> optimizer = nlp.begin_training(get_examples)
+> optimizer = nlp.initialize(get_examples)
> ```
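For the case described above where no `get_examples` function is passed, a minimal sketch (with made-up labels) looks like this; the output dimension has to be known before the call, here via `add_label`:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# With no get_examples function, components are initialized with generic
# data, so the label scheme must already be defined at this point.
for label in ("POSITIVE", "NEGATIVE"):  # hypothetical labels
    textcat.add_label(label)
optimizer = nlp.initialize()
```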
| Name | Description |

@@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
->     nlp.begin_training()
+>     nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
->     nlp.begin_training()
+>     nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
-> nlp.begin_training()
+> nlp.initialize()
> disabled.restore()
> ```


@@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Morphologizer.begin_training {#begin_training tag="method"}
+## Morphologizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```

@@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). Note that you don't have to
-call this method if you provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+already been fully [initialized](#initialize). Note that you don't have to call
+this method if you provide a **representative data sample** to the
+[`initialize`](#initialize) method. In this case, all labels found in the sample
+will be automatically added to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Pipe.begin_training {#begin_training tag="method"}
+## Pipe.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```

@@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). If these conditions are
-violated, the function will raise an Error. The exception to this rule is when
-the component is [resizable](#is_resizable), in which case
+already been fully [initialized](#initialize). If these conditions are violated,
+the function will raise an Error. The exception to this rule is when the
+component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.
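A minimal sketch of that exception, following the API described on this page (whether a given component is actually resizable depends on its model, and the label is invented for the example):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")  # hypothetical label
# Instead of calling add_label on a fully initialized model, resize the
# output dimension directly if the model supports it.
if textcat.is_resizable():
    textcat.set_output(len(textcat.labels) + 1)
```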
@@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |

Note that in general, you don't have to call `pipe.add_label` if you provide a
-representative data sample to the [`begin_training`](#begin_training) method. In
-this case, all labels found in the sample will be automatically added to the
-model, and the output dimension will be
+representative data sample to the [`initialize`](#initialize) method. In this
+case, all labels found in the sample will be automatically added to the model,
+and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
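As a sketch of both options side by side (the tag names are invented, and the `Example` import path assumes the final v3 API):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# Option 1: declare each label explicitly
tagger.add_label("NOUN")

# Option 2: let initialize() infer the labels from a representative sample
doc = nlp.make_doc("I like trees")
examples = [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]
optimizer = nlp.initialize(lambda: examples)
```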
## Pipe.is_resizable {#is_resizable tag="method"}


@@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## SentenceRecognizer.begin_training {#begin_training tag="method"}
+## SentenceRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```


@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Tagger.begin_training {#begin_training tag="method"}
+## Tagger.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```

@@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

> #### Example
>


@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## TextCategorizer.begin_training {#begin_training tag="method"}
+## TextCategorizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```

| Name | Description |
| ----------- | ----------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |

@@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
| Name | Description |
| ----------- | ----------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
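Putting `update` and `rehearse` together, one interleaved step could look roughly like this (a sketch: `nlp` and `textcat` as above, `examples` holding new annotations and `initial_examples` resembling the original training data):

```python
# Sketch of one combined step against the "catastrophic forgetting" problem.
optimizer = nlp.resume_training()  # sets up the models for rehearsing
losses = {}
textcat.update(examples, sgd=optimizer, losses=losses)
textcat.rehearse(initial_examples, sgd=optimizer, losses=losses)
```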
@@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

> #### Example
>


@@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Tok2Vec.begin_training {#begin_training tag="method"}
+## Tok2Vec.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```


@@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Transformer.begin_training {#begin_training tag="method"}
+## Transformer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```


@@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
-functionality is triggered when
-[`nlp.begin_training`](/api/language#begin_training) is called.
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
+called.
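From user code, that initialization can be sketched like this (the labels and text are invented, and the `Example` import path assumes the final v3 API):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")  # the model's output dim (nO) is unset
doc = nlp.make_doc("This is great")
examples = [Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})]
# Triggers validation and shape inference from the sample data
nlp.initialize(lambda: examples)
```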
### Dropout and normalization in Thinc {#thinc-dropout-norm}

@@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `begin_training`, correlation with add_label
+- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->


@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.

| Name | Description |
| ----------- | ----------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
-| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
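To make the division of labor concrete, here is a heavily abridged skeleton of such a component, for a hypothetical relation extractor; the signatures are indicative only, based on the table above, not a complete implementation:

```python
from spacy.pipeline import Pipe

class RelationExtractor(Pipe):
    """Sketch of a trainable component overriding the methods above."""

    def initialize(self, get_examples, *, nlp=None):
        # Validate the network, infer shapes and set up the label scheme,
        # typically by calling Model.initialize with sample data.
        ...

    def update(self, examples, *, drop=0.0, sgd=None, losses=None):
        # Predict, compare predictions against the gold annotations in the
        # Example objects, and backpropagate the gradient.
        ...

    def get_loss(self, examples, scores):
        # Return a (loss, gradient) tuple for the batch.
        ...

    def score(self, examples, **kwargs):
        # Return a dictionary of scores shown in the training output.
        ...
```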
<Infobox title="Custom trainable components and models" emoji="📖">


@@ -1045,8 +1045,8 @@ of being dropped.
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
>   their models.
-> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Start the training and return
+>   an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
>   state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.

@@ -1057,7 +1057,7 @@ of being dropped.
```python
### Example training loop
-optimizer = nlp.begin_training()
+optimizer = nlp.initialize()
for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:


@@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
  [`Pipe.update`](/api/pipe#update) methods now all take batches of
  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
  raw text and a dictionary of annotations.
-  [`Language.begin_training`](/api/language#begin_training) and
-  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-  returns a sequence of `Example` objects to initialize the model instead of a
-  list of tuples.
+  [`Language.initialize`](/api/language#initialize) and
+  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
+  sequence of `Example` objects to initialize the model instead of a list of
+  tuples.
+- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
  [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
  patterns as the second argument (instead of a variable number of arguments).

@@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| ----------- | ----------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |

@@ -936,7 +938,7 @@ TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
-nlp.begin_training()
+nlp.initialize()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for batch in minibatch(TRAIN_DATA):
@@ -946,17 +948,18 @@ for i in range(20):
        nlp.update(examples)
```

-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples. The data examples are used to **initialize the models** of
+`Language.begin_training` and `Pipe.begin_training` have been renamed to
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
+that returns a sequence of `Example` objects to initialize the model instead of
+a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.

```diff
- nlp.begin_training(examples)
+ nlp.initialize(lambda: examples)
```
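Concretely, migrating a v2-style call can be sketched as follows, reusing `nlp` and `TRAIN_DATA` from the example above (the `Example` import path assumes the final v3 API):

```python
from spacy.training import Example

examples = []
for text, annotations in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annotations))
# v3: pass a no-argument function that returns the examples
optimizer = nlp.initialize(lambda: examples)
```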
#### Packaging trained pipelines {#migrating-training-packaging}