mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Clean up sgd and pipeline -> nlp
This commit is contained in:
parent
612bbf85ab
commit
f171903139
|
@ -1,5 +1,5 @@
|
|||
from itertools import islice
|
||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
|
||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
import random
|
||||
|
@ -144,20 +144,14 @@ class EntityLinker(Pipe):
|
|||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
nlp: Optional[Language] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
|
||||
"""
|
||||
|
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
|
|||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
||||
def update(
|
||||
self,
|
||||
|
|
|
@ -129,16 +129,13 @@ class Morphologizer(Tagger):
|
|||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||
return 1
|
||||
|
||||
def initialize(self, get_examples, *, pipeline=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
|
||||
"""
|
||||
|
|
|
@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
|
|||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, pipeline=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
|
@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
|
|||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, pipeline=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||
self.model.output_layer.initialize(X)
|
||||
|
|
|
@ -183,7 +183,7 @@ cdef class Pipe:
|
|||
"""
|
||||
return util.create_default_optimizer()
|
||||
|
||||
def initialize(self, get_examples, *, pipeline=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
This method needs to be implemented by each Pipe component,
|
||||
ensuring the internal model (if available) is initialized properly
|
||||
|
@ -191,14 +191,11 @@ cdef class Pipe:
|
|||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
||||
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
|
||||
|
||||
def _ensure_examples(self, get_examples):
|
||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||
|
|
|
@ -58,7 +58,7 @@ class Sentencizer(Pipe):
|
|||
else:
|
||||
self.punct_chars = set(self.default_punct_chars)
|
||||
|
||||
def initialize(self, get_examples, pipeline=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
pass
|
||||
|
||||
def __call__(self, doc):
|
||||
|
|
|
@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
|
|||
raise ValueError("nan value when computing loss")
|
||||
return float(loss), d_scores
|
||||
|
||||
def initialize(self, get_examples, *, pipeline=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
RETURNS: None
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
|
||||
"""
|
||||
|
|
|
@ -256,16 +256,13 @@ class Tagger(Pipe):
|
|||
raise ValueError("nan value when computing loss")
|
||||
return float(loss), d_scores
|
||||
|
||||
def initialize(self, get_examples, *, pipeline=None):
|
||||
def initialize(self, get_examples, *, nlp=None):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects..
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
||||
"""
|
||||
|
|
|
@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
|
|||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
|
||||
) -> Optimizer:
|
||||
nlp: Optional[Language] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
||||
"""
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
||||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
|
||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||
from itertools import islice
|
||||
|
||||
|
@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
|
|||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
nlp: Optional[Language] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
||||
components that this component is part of. Corresponds to
|
||||
nlp.pipeline.
|
||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
||||
create_optimizer if it doesn't exist.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
|
||||
"""
|
||||
|
|
|
@ -405,7 +405,7 @@ cdef class Parser(Pipe):
|
|||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
||||
def initialize(self, get_examples, pipeline=None, settings=None):
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
self._ensure_examples(get_examples)
|
||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||
|
@ -425,8 +425,8 @@ cdef class Parser(Pipe):
|
|||
# make sure we resize so we have an appropriate upper layer
|
||||
self._resize()
|
||||
doc_sample = []
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if nlp is not None:
|
||||
for name, component in nlp.pipeline:
|
||||
if component is self:
|
||||
break
|
||||
if hasattr(component, "pipe"):
|
||||
|
@ -438,8 +438,8 @@ cdef class Parser(Pipe):
|
|||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline)
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
serializers = {
|
||||
|
|
|
@ -107,7 +107,7 @@ def validate_init_settings(
|
|||
*,
|
||||
section: Optional[str] = None,
|
||||
name: str = "",
|
||||
exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
|
||||
exclude: Iterable[str] = ("get_examples", "nlp"),
|
||||
) -> Dict[str, Any]:
|
||||
"""Validate initialization settings against the expected arguments in
|
||||
the method signature. Will parse values if possible (e.g. int to string)
|
||||
|
|
Loading…
Reference in New Issue
Block a user