commit ff9a63bfbd
parent 046f655d86

    begin_training -> initialize
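The change is mechanical at call sites: every `begin_training` becomes `initialize`, and the old name is kept on `Language` as a deprecated alias (see the shim further down). A minimal before/after sketch, assuming a pipeline `nlp` and a list `train_examples` of Example objects:

    # before this commit:
    # optimizer = nlp.begin_training(get_examples=lambda: train_examples)
    # after this commit:
    optimizer = nlp.initialize(get_examples=lambda: train_examples)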
@@ -103,12 +103,12 @@ def debug_model(
     with data_validation(False):
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
@@ -85,6 +85,7 @@ class Warnings:
             "attribute or operator.")

     # TODO: fix numbering after merging develop into master
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")

@@ -306,7 +307,7 @@ class Errors:
             "settings: {opts}")
     E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
     E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call begin_training()?")
+            "call initialize()?")
     E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
     E111 = ("Pickling a token is not supported, because tokens are only views "
             "of the parent Doc and can't exist on their own. A pickled token "

@@ -376,7 +377,7 @@ class Errors:
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. This can be fixed "
             "by calling add_label, or by providing a representative batch of "
-            "examples to the component's begin_training method.")
+            "examples to the component's initialize method.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "

@@ -517,7 +518,7 @@ class Errors:
             "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+    E930 = ("Received invalid get_examples callback in {name}.initialize. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
     E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
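W089 backs the deprecation shim added to `Language` below: the old method keeps working but emits a DeprecationWarning. A hedged sketch of how calling code can observe it, using only the standard `warnings` module (`nlp` is assumed to be an existing pipeline, not defined in this diff):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        nlp.begin_training()  # delegates to nlp.initialize() and warns W089
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)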
@@ -1154,6 +1154,16 @@ class Language:
         *,
         sgd: Optional[Optimizer] = None,
         device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd, device=device)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
+        device: int = -1,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.

@@ -1163,11 +1173,11 @@ class Language:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]

@@ -1179,7 +1189,7 @@ class Language:
         for example in get_examples():
             if not isinstance(example, Example):
                 err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
+                    name="Language.initialize", types=type(example)
                 )
                 raise ValueError(err)
             else:

@@ -1198,8 +1208,8 @@ class Language:
             sgd = create_default_optimizer()
         self._optimizer = sgd
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
+            if hasattr(proc, "initialize"):
+                proc.initialize(
                     get_examples, pipeline=self.pipeline, sgd=self._optimizer
                 )
         self._link_components()
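Taken together, the `Language` hunks above define the new entry point: `get_examples` is an optional callable returning an iterable of Example objects, `sgd` and `device` are keyword-only, and an Optimizer is returned; with no callback, dummy examples are created as shown. A minimal usage sketch, assuming the v3 nightly import paths (not part of this diff):

    from spacy.lang.en import English
    from spacy.training import Example

    nlp = English()
    nlp.add_pipe("tagger").add_label("A")
    train_examples = [Example.from_dict(nlp.make_doc("hello world"), {})]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)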
@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+            labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)

     @property
     def labels(self):
@@ -140,7 +140,7 @@ class EntityLinker(Pipe):
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,

@@ -159,7 +159,7 @@ class EntityLinker(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
         self._require_kb()
@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

@@ -142,7 +142,7 @@ class Morphologizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
         # First, fetch all labels from the data
@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)

@@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
+        self.model.output_layer.initialize(X)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, pipeline=pipeline)

     @property
     def labels(self):
@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly

@@ -198,7 +198,7 @@ cdef class Pipe:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
         raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
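The base `Pipe.initialize` deliberately raises NotImplementedError, so each trainable component overrides it. A hedged sketch of the expected shape for a custom component (class and variable names here are illustrative, not from this diff):

    class MyPipe:
        def __init__(self, model, name="my_pipe"):
            self.model = model
            self.name = name

        def initialize(self, get_examples, *, pipeline=None, sgd=None):
            # Use a representative batch to infer shapes, then
            # initialize the internal model.
            docs = [eg.predicted for eg in get_examples()]
            self.model.initialize(X=docs)
            return sgd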
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         pass

     def __call__(self, doc):
@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

@@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
@@ -256,7 +256,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.

@@ -269,7 +269,7 @@ class Tagger(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
@@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
             self.labels = tuple(list(self.labels) + [label])
         return 1

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,

@@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
         subbatch = []  # Select a subbatch of examples to initialize the model
@@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
     def get_loss(self, examples, scores) -> None:
         pass

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,

@@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self._ensure_examples(get_examples)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
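Components can also be initialized directly instead of going through `Language.initialize`, which is what several test fixtures below do. A condensed sketch mirroring them (`_parser_example` is a test-local helper, shown here only for illustration):

    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)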
@@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)

     doc.ents = [("ANIMAL", 3, 4)]

@@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
     doc.ents = list(doc.ents)
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):

@@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training(lambda: [_ner_example(ner1)])
+    ner1.initialize(lambda: [_ner_example(ner1)])
     ner2 = EntityRecognizer(Vocab(), model, **config)

     # the second model needs to be resized before we can call from_bytes
@@ -202,7 +202,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)

@@ -213,7 +213,7 @@ def test_train_empty():
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     # The untrained NER will predict O for each token
     doc = nlp("I live in New York")
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]

@@ -235,7 +235,7 @@ def test_empty_ner():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
     result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]

@@ -254,7 +254,7 @@ def test_ruler_before_ner():
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]

@@ -269,7 +269,7 @@ def test_ner_before_ruler():
     # 1: untrained NER - should set everything to O
     untrained_ner = nlp.add_pipe("ner", name="uner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()

     # 2 : Entity Ruler - should set "this" to B and keep everything else O
     patterns = [{"label": "THING", "pattern": "This"}]

@@ -290,7 +290,7 @@ def test_block_ner():
     nlp.add_pipe("blocker", config={"start": 2, "end": 5})
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti L Korhonen speaking in Finland")
     expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
     expected_types = ["", "", "", "", "", "", "", ""]

@@ -307,7 +307,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for ent in annotations.get("entities"):
             ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

     for i in range(50):
         losses = {}

@@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" in caplog.text
     caplog.clear()
     nlp.vocab.lookups.add_table("lexeme_norm")
     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" not in caplog.text


@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name

     def __call__(self, doc):
-        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
         return doc
@@ -191,7 +191,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
@@ -34,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):
@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_empty(nlp):

@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_serialize(nlp):

@@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
     entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length

     # test whether the entity links are preserved by the `as_doc()` function

@@ -463,7 +463,7 @@ def test_overfitting_IO():
     )

     # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert entity_linker.model.get_dim("nO") == vector_length
     assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():

@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():

@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")

@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():

@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)

     for i in range(50):
         losses = {}
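The tests above pin down the `get_examples` contract enforced via E930 and friends: the argument must be a callable that returns an iterable of Example objects. A condensed sketch of the contract, assuming `nlp` and `train_examples` as in the tests:

    nlp.initialize(get_examples=lambda: train_examples)  # OK: callable -> iterable of Example
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)  # callable, but result is not iterable
    with pytest.raises(ValueError):
        nlp.initialize(get_examples=train_examples)  # the list itself, not a callable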
@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():

@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False

     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

     for i in range(200):
         losses = {}
@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)


-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)


@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_no_resize():

@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):

@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []

@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: [])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():

@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):

@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),

@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]

@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():

@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():

@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")


-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []

@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():

@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2

     for i in range(50):

@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
@@ -88,7 +88,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")


@@ -154,7 +154,7 @@ def test_tok2vec_listener():

     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

     for i in range(5):
@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:

@@ -250,7 +250,7 @@ def test_issue1915():
     ner = nlp.add_pipe("ner")
     ner.add_label("answer")
     with pytest.raises(ValueError):
-        nlp.begin_training(**cfg)
+        nlp.initialize(**cfg)


 def test_issue1945():

@@ -30,7 +30,7 @@ def test_issue2179():
     nlp = Italian()
     ner = nlp.add_pipe("ner")
     ner.add_label("CITIZENSHIP")
-    nlp.begin_training()
+    nlp.initialize()
     nlp2 = Italian()
     nlp2.add_pipe("ner")
     assert len(nlp2.get_pipe("ner").labels) == 0

@@ -18,7 +18,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("hello world")
     assert doc.has_annotation("TAG")
     docs = nlp.pipe(["hello", "world"])

@@ -149,7 +149,7 @@ def test_issue2800():
     ner = nlp.add_pipe("ner")
     for entity_type in list(entity_types):
         ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(20):
         losses = {}
         random.shuffle(train_data)

@@ -92,7 +92,7 @@ def test_issue3209():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("ANIMAL")
-    nlp.begin_training()
+    nlp.initialize()
     move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
     assert ner.move_names == move_names
     nlp2 = English()

@@ -239,7 +239,7 @@ def test_issue3456():
     nlp = English()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     list(nlp.pipe(["hi", ""]))
@@ -223,7 +223,7 @@ def test_issue3611():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

@@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" not in parser.labels


@@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" in parser.labels
@@ -342,7 +342,7 @@ def test_issue3880():
     nlp.add_pipe("parser").add_label("dep")
     nlp.add_pipe("ner").add_label("PERSON")
     nlp.add_pipe("tagger").add_label("NN")
-    nlp.begin_training()
+    nlp.initialize()
     for doc in nlp.pipe(texts):
         pass
@@ -66,7 +66,7 @@ def test_issue4030():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

@@ -87,7 +87,7 @@ def test_issue4042():
     # add ner pipe
     ner = nlp.add_pipe("ner")
     ner.add_label("SOME_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # Add entity ruler
     patterns = [
         {"label": "MY_ORG", "pattern": "Apple"},

@@ -118,7 +118,7 @@ def test_issue4042_bug2():
     # add ner pipe
     ner1 = nlp1.add_pipe("ner")
     ner1.add_label("SOME_LABEL")
-    nlp1.begin_training()
+    nlp1.initialize()
     # add a new label to the doc
     doc1 = nlp1("What do you think about Apple ?")
     assert len(ner1.labels) == 1

@@ -244,7 +244,7 @@ def test_issue4267():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("PEOPLE")
-    nlp.begin_training()
+    nlp.initialize()
     assert "ner" in nlp.pipe_names
     # assert that we have correct IOB annotations
     doc1 = nlp("hi")

@@ -299,7 +299,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training(lambda: [])
+    ner.initialize(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1

@@ -327,7 +327,7 @@ def test_issue4348():
     TRAIN_DATA = [example, example]
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
@@ -180,7 +180,7 @@ def test_issue4725_2():
     vocab.set_vector("dog", data[1])
     nlp = English(vocab=vocab)
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     docs = ["Kurt is in London."] * 10
     for _ in nlp.pipe(docs, batch_size=2, n_process=2):
         pass

@@ -64,7 +64,7 @@ def tagger():
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     return tagger


@@ -85,7 +85,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    nlp.begin_training()
+    nlp.initialize()
     return entity_linker


@@ -25,7 +25,7 @@ def test_issue5551():
     pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
     for label in set(example[1]["cats"]):
         pipe.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()

     # Store the result of each iteration
     result = pipe.model.predict([nlp.make_doc(example[0])])
@ -152,7 +152,7 @@ def test_serialize_nlp():
|
||||||
nlp_config = Config().from_str(nlp_config_string)
|
nlp_config = Config().from_str(nlp_config_string)
|
||||||
nlp = load_model_from_config(nlp_config, auto_fill=True)
|
nlp = load_model_from_config(nlp_config, auto_fill=True)
|
||||||
nlp.get_pipe("tagger").add_label("A")
|
nlp.get_pipe("tagger").add_label("A")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
assert "tok2vec" in nlp.pipe_names
|
assert "tok2vec" in nlp.pipe_names
|
||||||
assert "tagger" in nlp.pipe_names
|
assert "tagger" in nlp.pipe_names
|
||||||
assert "parser" not in nlp.pipe_names
|
assert "parser" not in nlp.pipe_names
|
||||||
|
@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
|
||||||
parser_cfg = dict()
|
parser_cfg = dict()
|
||||||
parser_cfg["model"] = {"@architectures": "my_test_parser"}
|
parser_cfg["model"] = {"@architectures": "my_test_parser"}
|
||||||
nlp.add_pipe("parser", config=parser_cfg)
|
nlp.add_pipe("parser", config=parser_cfg)
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
nlp.to_disk(d)
|
nlp.to_disk(d)
|
||||||
|
@ -191,7 +191,7 @@ def test_serialize_parser():
|
||||||
model_config = Config().from_str(parser_config_string)
|
model_config = Config().from_str(parser_config_string)
|
||||||
parser = nlp.add_pipe("parser", config=model_config)
|
parser = nlp.add_pipe("parser", config=model_config)
|
||||||
parser.add_label("nsubj")
|
parser.add_label("nsubj")
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
nlp.to_disk(d)
|
nlp.to_disk(d)
|
||||||
|
|
|
@ -18,7 +18,7 @@ def nlp():
|
||||||
textcat = nlp.add_pipe("textcat")
|
textcat = nlp.add_pipe("textcat")
|
||||||
for label in ("POSITIVE", "NEGATIVE"):
|
for label in ("POSITIVE", "NEGATIVE"):
|
||||||
textcat.add_label(label)
|
textcat.add_label(label)
|
||||||
nlp.begin_training()
|
nlp.initialize()
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,7 @@ def test_readers():
|
||||||
)
|
)
|
||||||
optimizer = T["optimizer"]
|
optimizer = T["optimizer"]
|
||||||
# simulate a training loop
|
# simulate a training loop
|
||||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
for example in train_corpus(nlp):
|
for example in train_corpus(nlp):
|
||||||
nlp.update([example], sgd=optimizer)
|
nlp.update([example], sgd=optimizer)
|
||||||
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
||||||
|
@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
|
||||||
)
|
)
|
||||||
optimizer = T["optimizer"]
|
optimizer = T["optimizer"]
|
||||||
# simulate a training loop
|
# simulate a training loop
|
||||||
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
for example in train_corpus(nlp):
|
for example in train_corpus(nlp):
|
||||||
assert example.y.cats
|
assert example.y.cats
|
||||||
# this shouldn't fail if each training example has at least one positive label
|
# this shouldn't fail if each training example has at least one positive label
|
||||||
|
|
|
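The reader tests treat the corpus as a callable that yields `Example` objects for a given `nlp`, and thread a single optimizer through both `initialize` and `update`. Here is a minimal in-memory stand-in for such a corpus; a real setup would use a registered corpus reader over a `.spacy` file, but a plain generator function behaves the same way for this purpose (texts and categories are made up).

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
nlp.add_pipe("textcat")

def train_corpus(nlp):
    # stand-in for a corpus reader: yields fresh Example objects per call
    data = [
        ("This is great", {"POSITIVE": 1.0, "NEGATIVE": 0.0}),
        ("This is terrible", {"POSITIVE": 0.0, "NEGATIVE": 1.0}),
    ]
    for text, cats in data:
        yield Example.from_dict(nlp.make_doc(text), {"cats": cats})

optimizer = nlp.initialize(lambda: train_corpus(nlp))
for example in train_corpus(nlp):
    nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(train_corpus(nlp)))
```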
@@ -600,7 +600,7 @@ def _train_tuples(train_data):
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
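`_train_tuples` shows the v3 idiom for legacy `(text, annotations)` tuples: wrap each pair in an `Example` before training. In isolation (the entity offsets here are illustrative):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
train_data = [
    ("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]}),
]
# each (text, annotations) tuple becomes one Example object
train_examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
]
```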
@@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
         msg.info(f"Resuming training for: {resume_components}")
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
+    # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
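`init_nlp` only initializes components that are neither frozen nor resumed, by disabling the rest inside `select_pipes`. A user-level sketch of the same idea, treating `tok2vec` as the frozen component (the optimizer settings are arbitrary):

```python
import spacy
from thinc.api import Adam

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")

optimizer = Adam(0.001)
frozen_components = ["tok2vec"]  # illustrative: pretend tok2vec is frozen
with nlp.select_pipes(disable=frozen_components):
    nlp.initialize(sgd=optimizer)  # only the enabled pipes are initialized
```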
@@ -517,18 +517,18 @@ specific data and challenge.
 Stacked ensemble of a bag-of-words model and a neural network model. The neural
 network has an internal CNN Tok2Vec layer and uses attention.

 | Name                 | Description |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~ |
 | `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
 | `width`              | Output dimension of the feature encoding step. ~~int~~ |
 | `embed_size`         | Input dimension of the feature encoding step. ~~int~~ |
 | `conv_depth`         | Depth of the tok2vec layer. ~~int~~ |
 | `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
 | `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
 | `dropout`            | The dropout rate. ~~float~~ |
-| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

 ### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
 vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.

 | Name                | Description |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
 | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

 ### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
 An ngram "bag-of-words" model. This architecture should run much faster than the
 others, but may not be as accurate, especially if texts are short.

 | Name                | Description |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
 | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
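The `nO` rows above all make the same point: the output dimension can stay unset in the model config and is filled in when `initialize` runs. A sketch with `spacy.TextCatBOW.v1`; the other settings shown are plausible values, not required ones:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe(
    "textcat",
    config={
        "model": {
            "@architectures": "spacy.TextCatBOW.v1",
            "exclusive_classes": True,
            "ngram_size": 1,
            "no_output_layer": False,
            # "nO" is omitted on purpose: the TextCategorizer sets it
            # from the number of labels when initialize() is called
        }
    },
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()
```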
@@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:
 The `EntityLinker` model architecture is a Thinc `Model` with a
 [`Linear`](https://thinc.ai/api-layers#linear) output layer.

 | Name        | Description |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

 ### spacy.EmptyKB.v1 {#EmptyKB}

@@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## DependencyParser.begin_training {#begin_training tag="method"}
+## DependencyParser.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -151,11 +151,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = parser.update(examples, sgd=optimizer)
 > ```

@@ -294,11 +300,10 @@ context, the original parameters are restored.
 ## DependencyParser.add_label {#add_label tag="method"}

 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >
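The rewritten `add_label` note describes an alternative worth spelling out: skip `add_label` entirely and let `initialize` collect the labels from a representative sample. A tiny sketch for the parser, under the assumption that a single projective one-sentence parse is enough to register its labels (the sentence and annotations are illustrative):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
parser = nlp.add_pipe("parser")

doc = nlp.make_doc("I like eggs")
annots = {"heads": [1, 1, 1], "deps": ["nsubj", "ROOT", "dobj"]}
examples = [Example.from_dict(doc, annots)]

nlp.initialize(lambda: examples)
print(parser.labels)  # dependency labels collected from the sample
```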
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## EntityLinker.begin_training {#begin_training tag="method"}
+## EntityLinker.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -150,11 +150,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = entity_linker.update(examples, sgd=optimizer)
 > ```

@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## EntityRecognizer.begin_training {#begin_training tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -140,11 +140,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = ner.update(examples, sgd=optimizer)
 > ```

@@ -282,11 +288,10 @@ context, the original parameters are restored.
 ## EntityRecognizer.add_label {#add_label tag="method"}

 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >
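The same label-inference rule applies to the entity recognizer: a representative sample passed to `initialize` registers the entity types automatically. A sketch (the offsets and label are made up):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
ner = nlp.add_pipe("ner")

doc = nlp.make_doc("Apple is looking at a startup")
examples = [Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})]

nlp.initialize(lambda: examples)
assert "ORG" in ner.labels  # added from the sample, no add_label needed
```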
@@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
 | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |

-## Language.begin_training {#begin_training tag="method"}
+## Language.initialize {#initialize tag="method"}

 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
 function that returns an iterable of [`Example`](/api/example) objects. The data
 examples can either be the full training data or a representative sample. They
 are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`begin_training`](/api/pipe#begin_training) method, if
+passed each component's [`initialize`](/api/pipe#initialize) method, if
 available. Initialization includes validating the network,
 [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
 and setting up the label scheme based on the data.

-If no `get_examples` function is provided when calling `nlp.begin_training`, the
+If no `get_examples` function is provided when calling `nlp.initialize`, the
 pipeline components will be initialized with generic data. In this case, it is
 crucial that the output dimension of each component has already been defined
 either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).

-<Infobox variant="warning" title="Changed in v3.0">
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

-The `Language.update` method now takes a **function** that is called with no
-arguments and returns a sequence of [`Example`](/api/example) objects instead of
-tuples of `Doc` and `GoldParse` objects.
+This method was previously called `begin_training`. It now also takes a
+**function** that is called with no arguments and returns a sequence of
+[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
+objects.

 </Infobox>

@@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
 >
 > ```python
 > get_examples = lambda: examples
-> optimizer = nlp.begin_training(get_examples)
+> optimizer = nlp.initialize(get_examples)
 > ```

 | Name | Description |
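The two call forms the prose above describes, side by side as a sketch (assuming a textcat pipeline; the data is illustrative):

```python
from spacy.lang.en import English
from spacy.training import Example

# Form 1: pass get_examples, so labels and shapes come from the data
nlp = English()
nlp.add_pipe("textcat")
examples = [
    Example.from_dict(nlp.make_doc("so good"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("so bad"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
optimizer = nlp.initialize(lambda: examples)

# Form 2: no get_examples, so every output dimension must be defined up front
nlp2 = English()
textcat2 = nlp2.add_pipe("textcat")
textcat2.add_label("POSITIVE")
textcat2.add_label("NEGATIVE")
optimizer2 = nlp2.initialize()
```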
@@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
 >
 > ```python
 > with nlp.select_pipes(disable=["tagger", "parser"]):
->     nlp.begin_training()
+>     nlp.initialize()
 >
 > with nlp.select_pipes(enable="ner"):
->     nlp.begin_training()
+>     nlp.initialize()
 >
 > disabled = nlp.select_pipes(disable=["tagger", "parser"])
-> nlp.begin_training()
+> nlp.initialize()
 > disabled.restore()
 > ```

@@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Morphologizer.begin_training {#begin_training tag="method"}
+## Morphologizer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -133,7 +133,7 @@ setting up the label scheme based on the data.
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
 > nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
 >
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = morphologizer.update(examples, sgd=optimizer)
 > ```

@@ -259,12 +259,11 @@ context, the original parameters are restored.
 Add a new label to the pipe. If the `Morphologizer` should set annotations for
 both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
 Raises an error if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). Note that you don't have to
-call this method if you provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+already been fully [initialized](#initialize). Note that you don't have to call
+this method if you provide a **representative data sample** to the
+[`initialize`](#initialize) method. In this case, all labels found in the sample
+will be automatically added to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >

@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Pipe.begin_training {#begin_training tag="method"}
+## Pipe.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -109,11 +109,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = pipe.update(examples, sgd=optimizer)
 > ```

@@ -296,9 +302,9 @@ context, the original parameters are restored.
 Add a new label to the pipe, to be predicted by the model. The actual
 implementation depends on the specific component, but in general `add_label`
 shouldn't be called if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). If these conditions are
-violated, the function will raise an Error. The exception to this rule is when
-the component is [resizable](#is_resizable), in which case
+already been fully [initialized](#initialize). If these conditions are violated,
+the function will raise an Error. The exception to this rule is when the
+component is [resizable](#is_resizable), in which case
 [`set_output`](#set_output) should be called to ensure that the model is
 properly resized.

@@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
 | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |

 Note that in general, you don't have to call `pipe.add_label` if you provide a
-representative data sample to the [`begin_training`](#begin_training) method. In
-this case, all labels found in the sample will be automatically added to the
-model, and the output dimension will be
+representative data sample to the [`initialize`](#initialize) method. In this
+case, all labels found in the sample will be automatically added to the model,
+and the output dimension will be
 [inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 ## Pipe.is_resizable {#is_resizable tag="method"}
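For a custom component, `initialize` is the hook to override. A skeletal sketch following the signature shown in the docs above (`get_examples` plus keyword-only `pipeline`/`sgd`); the body is a placeholder, not a complete implementation:

```python
from spacy.pipeline import Pipe

class CustomPipe(Pipe):
    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        # Inspect a representative batch to set up labels and shapes
        for example in get_examples():
            ...  # e.g. collect labels from example.reference
        self.model.initialize()  # let Thinc infer any missing dimensions
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
```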
@@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## SentenceRecognizer.begin_training {#begin_training tag="method"}
+## SentenceRecognizer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -129,7 +129,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = senter.update(examples, sgd=optimizer)
 > ```

@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Tagger.begin_training {#begin_training tag="method"}
+## Tagger.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -123,11 +123,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tagger.update(examples, sgd=optimizer)
 > ```

@@ -289,12 +295,12 @@ context, the original parameters are restored.
 ## Tagger.add_label {#add_label tag="method"}

 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

 > #### Example
 >

@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## TextCategorizer.begin_training {#begin_training tag="method"}
+## TextCategorizer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -136,11 +136,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = textcat.update(examples, sgd=optimizer)
 > ```

 | Name | Description |
 | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |
 | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
 | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |

@@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
 | Name | Description |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop` | The dropout rate. ~~float~~ |
 | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
 | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |

@@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
 ## TextCategorizer.add_label {#add_label tag="method"}

 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

 > #### Example
 >
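The tagger variant of the same rule, as a sketch: supply tagged sample data to `initialize` instead of calling `add_label` by hand (the tags are arbitrary):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
tagger = nlp.add_pipe("tagger")

doc = nlp.make_doc("I like eggs")
examples = [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]

nlp.initialize(lambda: examples)
print(tagger.labels)  # tag set gathered from the sample
```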
@@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Tok2Vec.begin_training {#begin_training tag="method"}
+## Tok2Vec.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -138,7 +138,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tok2vec.update(examples, sgd=optimizer)
 > ```

@@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |

-## Transformer.begin_training {#begin_training tag="method"}
+## Transformer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -173,7 +173,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```

 | Name | Description |

@@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = trf.update(examples, sgd=optimizer)
 > ```
@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
|
||||||
that their internal models are **always initialized** with appropriate sample
|
that their internal models are **always initialized** with appropriate sample
|
||||||
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
|
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
|
||||||
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
|
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
|
||||||
functionality is triggered when
|
functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
|
||||||
[`nlp.begin_training`](/api/language#begin_training) is called.
|
called.
|
||||||
|
|
||||||
### Dropout and normalization in Thinc {#thinc-dropout-norm}
|
### Dropout and normalization in Thinc {#thinc-dropout-norm}
|
||||||
|
|
||||||
|
@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
|
||||||
|
|
||||||
<!-- TODO: write trainable component section
|
<!-- TODO: write trainable component section
|
||||||
- Interaction with `predict`, `get_loss` and `set_annotations`
|
- Interaction with `predict`, `get_loss` and `set_annotations`
|
||||||
- Initialization life-cycle with `begin_training`, correlation with add_label
|
- Initialization life-cycle with `initialize`, correlation with add_label
|
||||||
Example: relation extraction component (implemented as project template)
|
Example: relation extraction component (implemented as project template)
|
||||||
Avoid duplication with usage/processing-pipelines#trainable-components ?
|
Avoid duplication with usage/processing-pipelines#trainable-components ?
|
||||||
-->
|
-->
|
||||||
|
|
|
@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
 | Name | Description |
-| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
-| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
 | [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
 | [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
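To make the table concrete, here is a minimal skeleton of a component that overwrites these methods. This is a sketch only: `MyPipe` is an invented name, and the signatures are loosely based on the `/api/pipe` docs referenced above rather than a guaranteed-stable API.

```python
# Sketch: a custom trainable component overwriting the methods from the
# table above. Method bodies are stubs; see /api/pipe for the details.
from spacy.pipeline import Pipe

class MyPipe(Pipe):
    def update(self, examples, *, drop=0.0, sgd=None, losses=None):
        # Learn from a batch of Example objects and update the model.
        ...

    def initialize(self, get_examples, *, nlp=None):
        # Called with a function that returns Example objects; typically
        # calls into Model.initialize and create_optimizer.
        ...

    def get_loss(self, examples, scores):
        # Return a tuple of the loss and the gradient for the batch.
        ...

    def score(self, examples, **kwargs):
        # Return a dictionary of scores for the batch.
        ...
```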
@@ -1045,8 +1045,8 @@ of being dropped.
 
 > - [`nlp`](/api/language): The `nlp` object with the pipeline components and
 >   their models.
-> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Start the training and return
+>   an optimizer to update the component model weights.
 > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
 >   state between updates.
 > - [`nlp.update`](/api/language#update): Update component models with examples.
@@ -1057,7 +1057,7 @@ of being dropped.
 
 ```python
 ### Example training loop
-optimizer = nlp.begin_training()
+optimizer = nlp.initialize()
 for itn in range(100):
     random.shuffle(train_data)
     for raw_text, entity_offsets in train_data:
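The hunk above cuts the example off mid-loop. For readers following along, a plausible, self-contained completion under the renamed API might look like the following; the blank pipeline, the tiny `train_data`, and the update call are assumptions in the style of the snippet, not part of this commit.

```python
# Hypothetical completion of the truncated training loop above; the
# blank "en" pipeline and one-item train_data stand in for the real setup.
import random

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("ner")
train_data = [("I like London.", [(7, 13, "LOC")])]

examples = [
    Example.from_dict(nlp.make_doc(text), {"entities": offsets})
    for text, offsets in train_data
]
optimizer = nlp.initialize(lambda: examples)  # renamed from begin_training
for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        nlp.update([example], sgd=optimizer)
```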
@@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 [`Pipe.update`](/api/pipe#update) methods now all take batches of
 [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
 raw text and a dictionary of annotations.
-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples.
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
+sequence of `Example` objects to initialize the model instead of a list of
+tuples.
+- The `begin_training` methods have been renamed to `initialize`.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
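The `Matcher.add` change in the last bullet is mechanical; here is a small sketch of the v3 call, with the `"GREETING"` key and the pattern invented for illustration.

```python
# v3 convention: Matcher.add takes a list of patterns as its second
# argument, instead of a variable number of pattern arguments.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
patterns = [[{"LOWER": "hello"}, {"LOWER": "world"}]]
matcher.add("GREETING", patterns)
matches = matcher(nlp("Hello world!"))
```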
@@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | Removed | Replacement |
 | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
 | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
 | `GoldParse` | [`Example`](/api/example) |
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
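A few of the replacements in this table can be exercised in a couple of lines. The calls below are a sketch against the v3 API, with a blank English pipeline standing in for a trained one.

```python
# Sketch of the v3 replacements from the table above.
import spacy

nlp = spacy.blank("en")

# Language.disable_pipes -> Language.select_pipes
with nlp.select_pipes(disable=[]):
    doc = nlp("She lives in Berlin.")

# Doc.is_tagged / Doc.is_parsed -> Doc.has_annotation
print(doc.has_annotation("TAG"), doc.has_annotation("DEP"))
```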
@@ -936,7 +938,7 @@ TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
     ("I like London.", {"entities": [(7, 13, "LOC")]}),
 ]
-nlp.begin_training()
+nlp.initialize()
 for i in range(20):
     random.shuffle(TRAIN_DATA)
     for batch in minibatch(TRAIN_DATA):
@@ -946,17 +948,18 @@ for i in range(20):
         nlp.update(examples)
 ```
 
-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples. The data examples are used to **initialize the models** of
+`Language.begin_training` and `Pipe.begin_training` have been renamed to
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
+that returns a sequence of `Example` objects to initialize the model instead of
+a list of tuples. The data examples are used to **initialize the models** of
 trainable pipeline components, which includes validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme.
 
 ```diff
-- nlp.begin_training(examples)
-+ nlp.begin_training(lambda: examples)
+- nlp.initialize(examples)
++ nlp.initialize(lambda: examples)
 ```
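Put together, the migration usually amounts to wrapping the existing example data in a zero-argument function, mirroring the diff above. The pipeline setup and data below are illustrative, not part of the commit.

```python
# Sketch of the new convention: initialize() takes a function that
# returns Example objects, rather than the examples themselves.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
examples = [
    Example.from_dict(
        nlp.make_doc("I like London."), {"entities": [(7, 13, "LOC")]}
    )
]
optimizer = nlp.initialize(lambda: examples)
```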

#### Packaging trained pipelines {#migrating-training-packaging}