Clean up sgd and pipeline -> nlp

This commit is contained in:
Ines Montani 2020-09-29 12:20:26 +02:00
parent 612bbf85ab
commit f171903139
11 changed files with 28 additions and 58 deletions

View File

@@ -1,5 +1,5 @@
from itertools import islice from itertools import islice
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
from pathlib import Path from pathlib import Path
import srsly import srsly
import random import random
@@ -144,20 +144,14 @@ class EntityLinker(Pipe):
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, nlp: Optional[Language] = None,
sgd: Optional[Optimizer] = None, ):
) -> Optimizer:
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/entitylinker#initialize DOCS: https://nightly.spacy.io/api/entitylinker#initialize
""" """
@@ -174,9 +168,6 @@ class EntityLinker(Pipe):
self.model.initialize( self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
) )
if sgd is None:
sgd = self.create_optimizer()
return sgd
def update( def update(
self, self,

View File

@@ -129,16 +129,13 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1 return 1
def initialize(self, get_examples, *, pipeline=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/morphologizer#initialize DOCS: https://nightly.spacy.io/api/morphologizer#initialize
""" """

View File

@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def initialize(self, get_examples, pipeline=None): def initialize(self, get_examples, nlp=None):
if not hasattr(get_examples, "__call__"): if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err) raise ValueError(err)
@@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def initialize(self, get_examples, pipeline=None): def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.initialize(X) self.model.output_layer.initialize(X)

View File

@@ -183,7 +183,7 @@ cdef class Pipe:
""" """
return util.create_default_optimizer() return util.create_default_optimizer()
def initialize(self, get_examples, *, pipeline=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component, This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly ensuring the internal model (if available) is initialized properly
@@ -191,14 +191,11 @@ cdef class Pipe:
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/pipe#initialize DOCS: https://nightly.spacy.io/api/pipe#initialize
""" """
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
def _ensure_examples(self, get_examples): def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"): if get_examples is None or not hasattr(get_examples, "__call__"):

View File

@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else: else:
self.punct_chars = set(self.default_punct_chars) self.punct_chars = set(self.default_punct_chars)
def initialize(self, get_examples, pipeline=None): def initialize(self, get_examples, nlp=None):
pass pass
def __call__(self, doc): def __call__(self, doc):

View File

@@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss") raise ValueError("nan value when computing loss")
return float(loss), d_scores return float(loss), d_scores
def initialize(self, get_examples, *, pipeline=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
RETURNS: None
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
""" """

View File

@@ -256,16 +256,13 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss") raise ValueError("nan value when computing loss")
return float(loss), d_scores return float(loss), d_scores
def initialize(self, get_examples, *, pipeline=None): def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.. returns a representative sample of gold-standard Example objects..
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tagger#initialize DOCS: https://nightly.spacy.io/api/tagger#initialize
""" """

View File

@@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None nlp: Optional[Language] = None,
) -> Optimizer: ):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
""" """

View File

@@ -1,4 +1,4 @@
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice from itertools import islice
@@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
self, self,
get_examples: Callable[[], Iterable[Example]], get_examples: Callable[[], Iterable[Example]],
*, *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, nlp: Optional[Language] = None,
sgd: Optional[Optimizer] = None,
): ):
"""Initialize the pipe for training, using a representative set """Initialize the pipe for training, using a representative set
of data examples. of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline nlp (Language): The current nlp object the component is part of.
components that this component is part of. Corresponds to
nlp.pipeline.
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tok2vec#initialize DOCS: https://nightly.spacy.io/api/tok2vec#initialize
""" """

View File

@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO): def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO) self.model.attrs["resize_output"](self.model, nO)
def initialize(self, get_examples, pipeline=None, settings=None): def initialize(self, get_examples, nlp=None):
self._ensure_examples(get_examples) self._ensure_examples(get_examples)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@@ -425,8 +425,8 @@ cdef class Parser(Pipe):
# make sure we resize so we have an appropriate upper layer # make sure we resize so we have an appropriate upper layer
self._resize() self._resize()
doc_sample = [] doc_sample = []
if pipeline is not None: if nlp is not None:
for name, component in pipeline: for name, component in nlp.pipeline:
if component is self: if component is self:
break break
if hasattr(component, "pipe"): if hasattr(component, "pipe"):
@@ -438,8 +438,8 @@ cdef class Parser(Pipe):
doc_sample.append(example.predicted) doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample) self.model.initialize(doc_sample)
if pipeline is not None: if nlp is not None:
self.init_multitask_objectives(get_examples, pipeline) self.init_multitask_objectives(get_examples, nlp.pipeline)
def to_disk(self, path, exclude=tuple()): def to_disk(self, path, exclude=tuple()):
serializers = { serializers = {

View File

@@ -107,7 +107,7 @@ def validate_init_settings(
*, *,
section: Optional[str] = None, section: Optional[str] = None,
name: str = "", name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"), exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in """Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string) the method signature. Will parse values if possible (e.g. int to string)