diff --git a/setup.py b/setup.py index 4a4b99f22..604d65745 100755 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ MOD_NAMES = [ "spacy.pipeline.multitask", "spacy.pipeline.ner", "spacy.pipeline.pipe", + "spacy.pipeline.trainable_pipe", "spacy.pipeline.sentencizer", "spacy.pipeline.senter", "spacy.pipeline.tagger", diff --git a/spacy/about.py b/spacy/about.py index 108689074..095d726a0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a35" +__version__ = "3.0.0a36" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/errors.py b/spacy/errors.py index bf3628ce9..2bc2f3e20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -522,14 +522,12 @@ class Errors: E928 = ("A KnowledgeBase can only be serialized to/from from a directory, " "but the provided argument {loc} points to a file.") E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.") - E930 = ("Received invalid get_examples callback in `{name}.initialize`. " + E930 = ("Received invalid get_examples callback in `{method}`. " "Expected function that returns an iterable of Example objects but " "got: {obj}") - E931 = ("Encountered Pipe subclass without `Pipe.{method}` method in component " - "'{name}'. If the component is trainable and you want to use this " - "method, make sure it's overwritten on the subclass. If your " - "component isn't trainable, add a method that does nothing or " - "don't use the Pipe base class.") + E931 = ("Encountered {parent} subclass without `{parent}.{method}` " + "method in component '{name}'. If you want to use this " + "method, make sure it's overwritten on the subclass.") E940 = ("Found NaN values in scores.") E941 = ("Can't find model '{name}'. It looks like you're trying to load a " "model from a shortcut, which is deprecated as of spaCy v3.0. To " diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4a71b26a2..d61bd43fa 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -30,6 +30,7 @@ cdef class KnowledgeBase: cdef Pool mem cpdef readonly Vocab vocab cdef int64_t entity_vector_length + cdef public set _added_strings # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _KBEntryC struct in the _entries vector). diff --git a/spacy/kb.pyx b/spacy/kb.pyx index bdf652766..478579d71 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,5 +1,7 @@ # cython: infer_types=True, profile=True -from typing import Iterator +from typing import Iterator, Iterable + +import srsly from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from cpython.exc cimport PyErr_SetFromErrno @@ -10,13 +12,10 @@ from libcpp.vector cimport vector from pathlib import Path import warnings -from spacy.strings import StringStore - -from spacy import util - from .typedefs cimport hash_t from .errors import Errors, Warnings - +from . 
import util +from .util import SimpleFrozenList, ensure_path cdef class Candidate: """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved @@ -85,9 +84,6 @@ cdef class KnowledgeBase: DOCS: https://nightly.spacy.io/api/kb """ - contents_loc = "contents" - strings_loc = "strings.json" - def __init__(self, Vocab vocab, entity_vector_length): """Create a KnowledgeBase.""" self.mem = Pool() @@ -95,8 +91,8 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() self.vocab = vocab - self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + self._added_strings = set() @property def entity_vector_length(self): @@ -118,12 +114,16 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] + def add_string(self, string: str): + self._added_strings.add(string) + return self.vocab.strings.add(string) + def add_entity(self, unicode entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. """ - cdef hash_t entity_hash = self.vocab.strings.add(entity) + cdef hash_t entity_hash = self.add_string(entity) # Return if this entity was added before if entity_hash in self._entry_index: @@ -157,7 +157,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash while i < len(entity_list): # only process this entity if its unique ID hadn't been added before - entity_hash = self.vocab.strings.add(entity_list[i]) + entity_hash = self.add_string(entity_list[i]) if entity_hash in self._entry_index: warnings.warn(Warnings.W018.format(entity=entity_list[i])) @@ -203,7 +203,7 @@ cdef class KnowledgeBase: if prob_sum > 1.00001: raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) - cdef hash_t alias_hash = self.vocab.strings.add(alias) + cdef hash_t alias_hash = self.add_string(alias) # Check whether this alias was added before if alias_hash in self._alias_index: @@ -324,26 +324,27 @@ cdef class KnowledgeBase: return 0.0 - def to_disk(self, path): - path = util.ensure_path(path) + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + path = ensure_path(path) if not path.exists(): path.mkdir(parents=True) if not path.is_dir(): raise ValueError(Errors.E928.format(loc=path)) - self.write_contents(path / self.contents_loc) - self.vocab.strings.to_disk(path / self.strings_loc) + serialize = {} + serialize["contents"] = lambda p: self.write_contents(p) + serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + util.to_disk(path, serialize, exclude) - def from_disk(self, path): - path = util.ensure_path(path) + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + path = ensure_path(path) if not path.exists(): raise ValueError(Errors.E929.format(loc=path)) if not path.is_dir(): raise ValueError(Errors.E928.format(loc=path)) - self.read_contents(path / self.contents_loc) - kb_strings = StringStore() - kb_strings.from_disk(path / self.strings_loc) - for string in kb_strings: - self.vocab.strings.add(string) + deserialize = {} + deserialize["contents"] = lambda p: self.read_contents(p) + deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] + util.from_disk(path, deserialize, exclude) def write_contents(self, file_path): cdef Writer writer = Writer(file_path) diff --git a/spacy/language.py b/spacy/language.py index b438936a6..1fb559657 
100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples from .training.initialize import init_vocab, init_tok2vec from .scorer import Scorer -from .util import registry, SimpleFrozenList +from .util import registry, SimpleFrozenList, _pipe from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -1095,7 +1095,7 @@ class Language: if ( name not in exclude and hasattr(proc, "is_trainable") - and proc.is_trainable() + and proc.is_trainable and proc.model not in (True, False, None) ): proc.finish_update(sgd) @@ -1194,8 +1194,8 @@ class Language: doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Language", obj=type(get_examples)) - raise ValueError(err) + err = Errors.E930.format(method="Language.initialize", obj=type(get_examples)) + raise TypeError(err) # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config @@ -1301,16 +1301,7 @@ class Language: for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) - # non-trainable components may have a pipe() implementation that refers to dummy - # predict and set_annotations methods - if ( - not hasattr(pipe, "pipe") - or not hasattr(pipe, "is_trainable") - or not pipe.is_trainable() - ): - docs = _pipe(docs, pipe, kwargs) - else: - docs = pipe.pipe(docs, **kwargs) + docs = _pipe(docs, pipe, kwargs) # iterate over the final generator if len(self.pipeline): docs = list(docs) @@ -1417,17 +1408,7 @@ class Language: kwargs = component_cfg.get(name, {}) # Allow component_cfg to overwrite the top-level kwargs. kwargs.setdefault("batch_size", batch_size) - # non-trainable components may have a pipe() implementation that refers to dummy - # predict and set_annotations methods - if ( - hasattr(proc, "pipe") - and hasattr(proc, "is_trainable") - and proc.is_trainable() - ): - f = functools.partial(proc.pipe, **kwargs) - else: - # Apply the function, but yield the doc - f = functools.partial(_pipe, proc=proc, kwargs=kwargs) + f = functools.partial(_pipe, proc=proc, kwargs=kwargs) pipes.append(f) if n_process != 1: @@ -1826,19 +1807,6 @@ class DisabledPipes(list): self[:] = [] -def _pipe( - examples: Iterable[Example], proc: Callable[[Doc], Doc], kwargs: Dict[str, Any] -) -> Iterator[Example]: - # We added some args for pipe that __call__ doesn't expect. 
- kwargs = dict(kwargs) - for arg in ["batch_size"]: - if arg in kwargs: - kwargs.pop(arg) - for eg in examples: - eg = proc(eg, **kwargs) - yield eg - - def _apply_pipes( make_doc: Callable[[str], Doc], pipes: Iterable[Callable[[Doc], Doc]], diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 656182088..cec5b4eb5 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -6,6 +6,7 @@ from .entityruler import EntityRuler from .lemmatizer import Lemmatizer from .morphologizer import Morphologizer from .pipe import Pipe +from .trainable_pipe import TrainablePipe from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .tagger import Tagger @@ -21,6 +22,7 @@ __all__ = [ "EntityRuler", "Morphologizer", "Lemmatizer", + "TrainablePipe", "Pipe", "SentenceRecognizer", "Sentencizer", diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 0ab1ac9bf..7a6a1de5b 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -57,6 +57,7 @@ class AttributeRuler(Pipe): self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] + self._added_strings = set() def clear(self) -> None: """Reset all patterns.""" @@ -123,21 +124,6 @@ class AttributeRuler(Pipe): set_token_attrs(span[index], attrs) return doc - def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: - """Apply the pipe to a stream of documents. This usually happens under - the hood when the nlp object is called on a text and all components are - applied to the Doc. - - stream (Iterable[Doc]): A stream of documents. - batch_size (int): The number of documents to buffer. - YIELDS (Doc): Processed documents in order. - - DOCS: https://spacy.io/attributeruler/pipe#pipe - """ - for doc in stream: - doc = self(doc) - yield doc - def load_from_tag_map( self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]] ) -> None: @@ -201,12 +187,16 @@ class AttributeRuler(Pipe): # We need to make a string here, because otherwise the ID we pass back # will be interpreted as the hash of a string, rather than an ordinal. key = str(len(self.attrs)) - self.matcher.add(self.vocab.strings.add(key), patterns) + self.matcher.add(self.add_string(key), patterns) self._attrs_unnormed.append(attrs) attrs = normalize_token_attrs(self.vocab, attrs) self.attrs.append(attrs) self.indices.append(index) + def add_string(self, string: str): + self._added_strings.add(string) + return self.vocab.strings.add(string) + def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None: """Add patterns from a list of pattern dicts with the keys as the arguments to AttributeRuler.add. 
@@ -266,8 +256,8 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes """ serialize = {} - serialize["vocab"] = self.vocab.to_bytes serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) + serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) return util.to_bytes(serialize, exclude) def from_bytes( @@ -286,7 +276,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.msgpack_loads(b)) deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), + "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) @@ -303,7 +293,7 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#to_disk """ serialize = { - "vocab": lambda p: self.vocab.to_disk(p), + "strings.json": lambda p: srsly.write_json(p, self._added_strings), "patterns": lambda p: srsly.write_msgpack(p, self.patterns), } util.to_disk(path, serialize, exclude) @@ -324,7 +314,7 @@ class AttributeRuler(Pipe): self.add_patterns(srsly.read_msgpack(p)) deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), + "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)], "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index eec591995..881e98785 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -10,10 +10,11 @@ import warnings from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb from ..tokens import Doc -from .pipe import Pipe, deserialize_config +from .pipe import deserialize_config +from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab -from ..training import Example, validate_examples +from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors, Warnings from ..util import SimpleFrozenList from .. import util @@ -90,7 +91,7 @@ def make_entity_linker( ) -class EntityLinker(Pipe): +class EntityLinker(TrainablePipe): """Pipeline component for named entity linking. 
DOCS: https://nightly.spacy.io/api/entitylinker @@ -172,7 +173,7 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "EntityLinker.initialize") if kb_loader is not None: self.set_kb(kb_loader) self.validate_kb() @@ -453,7 +454,6 @@ class EntityLinker(Pipe): """ serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) @@ -477,11 +477,12 @@ class EntityLinker(Pipe): raise ValueError(Errors.E149) from None deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) deserialize["kb"] = lambda p: self.kb.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) + for s in self.kb._added_strings: + self.vocab.strings.add(s) return self def rehearse(self, examples, *, sgd=None, losses=None, **config): diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index dfaddad74..382ca338d 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -342,12 +342,6 @@ class EntityRuler(Pipe): validate_examples(examples, "EntityRuler.score") return Scorer.score_spans(examples, "ents", **kwargs) - def predict(self, docs): - pass - - def set_annotations(self, docs, scores): - pass - def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 9be596868..7f5370753 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -281,7 +281,6 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk """ serialize = {} - serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["lookups"] = lambda p: self.lookups.to_disk(p) util.to_disk(path, serialize, exclude) @@ -297,7 +296,6 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk """ deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["lookups"] = lambda p: self.lookups.from_disk(p) util.from_disk(path, deserialize, exclude) self._validate_tables() @@ -312,7 +310,6 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes """ serialize = {} - serialize["vocab"] = self.vocab.to_bytes serialize["lookups"] = self.lookups.to_bytes return util.to_bytes(serialize, exclude) @@ -328,7 +325,6 @@ class Lemmatizer(Pipe): DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes """ deserialize = {} - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) util.from_bytes(bytes_data, deserialize, exclude) self._validate_tables() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 6d97b062f..a456b7a0f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -16,7 +16,7 @@ from .pipe import deserialize_config from .tagger import Tagger from .. 
import util from ..scorer import Scorer -from ..training import validate_examples +from ..training import validate_examples, validate_get_examples default_model_config = """ @@ -95,6 +95,7 @@ class Morphologizer(Tagger): # add mappings for empty morph self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] + self._added_strings = set() @property def labels(self): @@ -128,6 +129,7 @@ class Morphologizer(Tagger): label_dict.pop(self.POS_FEAT) # normalize morph string and add to morphology table norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)] + self.add_string(norm_morph) # add label mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = norm_morph @@ -144,7 +146,7 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "Morphologizer.initialize") if labels is not None: self.cfg["labels_morph"] = labels["morph"] self.cfg["labels_pos"] = labels["pos"] @@ -159,6 +161,7 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + self.add_string(norm_label) # add label->morph and label->POS mappings if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = morph @@ -176,6 +179,7 @@ class Morphologizer(Tagger): if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + self.add_string(norm_label) gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels]) doc_sample.append(example.x) label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) @@ -234,6 +238,7 @@ class Morphologizer(Tagger): if pos: label_dict[self.POS_FEAT] = pos label = self.vocab.strings[self.vocab.morphology.add(label_dict)] + self.add_string(label) eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index fa304b842..e1ea49849 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -6,7 +6,7 @@ from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc -from .pipe import Pipe +from .trainable_pipe import TrainablePipe from .tagger import Tagger from ..training import validate_examples from ..language import Language @@ -164,7 +164,7 @@ class MultitaskObjective(Tagger): return "I-SENT" -class ClozeMultitask(Pipe): +class ClozeMultitask(TrainablePipe): def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd index bca94d528..bb97f79d0 100644 --- a/spacy/pipeline/pipe.pxd +++ b/spacy/pipeline/pipe.pxd @@ -1,5 +1,2 @@ cdef class Pipe: - cdef public object vocab - cdef public object model cdef public str name - cdef public object cfg diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 50e5108b9..afb59fdb3 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,38 +1,22 @@ # cython: infer_types=True, profile=True import warnings -from typing import Optional, Tuple +from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict import srsly -from thinc.api import set_dropout_rate, Model from ..tokens.doc cimport Doc -from ..training import validate_examples +from ..training import Example from ..errors 
import Errors, Warnings -from .. import util - +from ..language import Language cdef class Pipe: - """This class is a base class and not instantiated directly. Trainable - pipeline components like the EntityRecognizer or TextCategorizer inherit - from it and it defines the interface that components should follow to - function as trainable components in a spaCy pipeline. + """This class is a base class and not instantiated directly. It provides + an interface for pipeline components to implement. + Trainable pipeline components like the EntityRecognizer or TextCategorizer + should inherit from the subclass 'TrainablePipe'. DOCS: https://nightly.spacy.io/api/pipe """ - def __init__(self, vocab, model, name, **cfg): - """Initialize a pipeline component. - - vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. - name (str): The component instance name. - **cfg: Additonal settings and config parameters. - - DOCS: https://nightly.spacy.io/api/pipe#init - """ - self.vocab = vocab - self.model = model - self.name = name - self.cfg = dict(cfg) @classmethod def __init_subclass__(cls, **kwargs): @@ -41,18 +25,7 @@ cdef class Pipe: if hasattr(cls, "begin_training"): warnings.warn(Warnings.W088.format(name=cls.__name__)) - @property - def labels(self) -> Optional[Tuple[str]]: - return [] - - @property - def label_data(self): - """Optional JSON-serializable data that would be sufficient to recreate - the label set if provided to the `pipe.initialize()` method. - """ - return None - - def __call__(self, Doc doc): + def __call__(self, Doc doc) -> Doc: """Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -62,11 +35,9 @@ cdef class Pipe: DOCS: https://nightly.spacy.io/api/pipe#call """ - scores = self.predict([doc]) - self.set_annotations([doc], scores) - return doc + raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name)) - def pipe(self, stream, *, batch_size=128): + def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -77,137 +48,17 @@ cdef class Pipe: DOCS: https://nightly.spacy.io/api/pipe#pipe """ - for docs in util.minibatch(stream, size=batch_size): - scores = self.predict(docs) - self.set_annotations(docs, scores) - yield from docs + for doc in stream: + doc = self(doc) + yield doc - def predict(self, docs): - """Apply the pipeline's model to a batch of docs, without modifying them. - Returns a single tensor for a batch of documents. - - docs (Iterable[Doc]): The documents to predict. - RETURNS: Vector representations for each token in the documents. - - DOCS: https://nightly.spacy.io/api/pipe#predict - """ - raise NotImplementedError(Errors.E931.format(method="predict", name=self.name)) - - def set_annotations(self, docs, scores): - """Modify a batch of documents, using pre-computed scores. - - docs (Iterable[Doc]): The documents to modify. - scores: The scores to assign. 
- - DOCS: https://nightly.spacy.io/api/pipe#set_annotations - """ - raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name)) - - def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None): - """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict and get_loss. - - examples (Iterable[Example]): A batch of Example objects. - drop (float): The dropout rate. - set_annotations (bool): Whether or not to update the Example objects - with the predictions. - sgd (thinc.api.Optimizer): The optimizer. - losses (Dict[str, float]): Optional record of the loss during training. - Updated using the component name as the key. - RETURNS (Dict[str, float]): The updated losses dictionary. - - DOCS: https://nightly.spacy.io/api/pipe#update - """ - if losses is None: - losses = {} - if not hasattr(self, "model") or self.model in (None, True, False): - return losses - losses.setdefault(self.name, 0.0) - validate_examples(examples, "Pipe.update") - if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): - # Handle cases where there are no tokens in any docs. - return - set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples]) - loss, d_scores = self.get_loss(examples, scores) - bp_scores(d_scores) - if sgd not in (None, False): - self.finish_update(sgd) - losses[self.name] += loss - if set_annotations: - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, scores=scores) - return losses - - def rehearse(self, examples, *, sgd=None, losses=None, **config): - """Perform a "rehearsal" update from a batch of data. Rehearsal updates - teach the current model to make predictions similar to an initial model, - to try to address the "catastrophic forgetting" problem. This feature is - experimental. - - examples (Iterable[Example]): A batch of Example objects. - drop (float): The dropout rate. - sgd (thinc.api.Optimizer): The optimizer. - losses (Dict[str, float]): Optional record of the loss during training. - Updated using the component name as the key. - RETURNS (Dict[str, float]): The updated losses dictionary. - - DOCS: https://nightly.spacy.io/api/pipe#rehearse - """ - pass - - def get_loss(self, examples, scores): - """Find the loss and gradient of loss for the batch of documents and - their predicted scores. - - examples (Iterable[Examples]): The batch of examples. - scores: Scores representing the model's predictions. - RETURNS (Tuple[float, float]): The loss and the gradient. - - DOCS: https://nightly.spacy.io/api/pipe#get_loss - """ - raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name)) - - def add_label(self, label): - """Add an output label, to be predicted by the model. It's possible to - extend pretrained models with new labels, but care should be taken to - avoid the "catastrophic forgetting" problem. - - label (str): The label to add. - RETURNS (int): 0 if label is already present, otherwise 1. 
- - DOCS: https://nightly.spacy.io/api/pipe#add_label - """ - raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) - - - def _require_labels(self) -> None: - """Raise an error if the component's model has no labels defined.""" - if not self.labels or list(self.labels) == [""]: - raise ValueError(Errors.E143.format(name=self.name)) - - - def _allow_extra_label(self) -> None: - """Raise an error if the component can not add any more labels.""" - if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): - if not self.is_resizable(): - raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) - - - def create_optimizer(self): - """Create an optimizer for the pipeline component. - - RETURNS (thinc.api.Optimizer): The optimizer. - - DOCS: https://nightly.spacy.io/api/pipe#create_optimizer - """ - return util.create_default_optimizer() - - def initialize(self, get_examples, *, nlp=None): - """Initialize the pipe for training, using data examples if available. - This method needs to be implemented by each Pipe component, - ensuring the internal model (if available) is initialized properly - using the provided sample of Example objects. + def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): + """Initialize the pipe. For non-trainable components, this method + is optional. For trainable components, which should inherit + from the subclass TrainablePipe, the provided data examples + should be used to ensure that the internal model is initialized + properly and all input/output dimensions throughout the network are + inferred. get_examples (Callable[[], Iterable[Example]]): Function that returns a representative sample of gold-standard Example objects. @@ -217,49 +68,7 @@ cdef class Pipe: """ pass - def _ensure_examples(self, get_examples): - if get_examples is None or not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name=self.name, obj=type(get_examples)) - raise ValueError(err) - if not get_examples(): - err = Errors.E930.format(name=self.name, obj=get_examples()) - raise ValueError(err) - - def is_resizable(self): - return hasattr(self, "model") and "resize_output" in self.model.attrs - - def is_trainable(self): - return hasattr(self, "model") and isinstance(self.model, Model) - - def set_output(self, nO): - if self.is_resizable(): - self.model.attrs["resize_output"](self.model, nO) - else: - raise NotImplementedError(Errors.E921) - - def use_params(self, params): - """Modify the pipe's model, to use the given parameter values. At the - end of the context, the original parameters are restored. - - params (dict): The parameter values to use in the model. - - DOCS: https://nightly.spacy.io/api/pipe#use_params - """ - with self.model.use_params(params): - yield - - def finish_update(self, sgd): - """Update parameters using the current parameter gradients. - The Optimizer instance contains the functionality to perform - the stochastic gradient descent. - - sgd (thinc.api.Optimizer): The optimizer. - - DOCS: https://nightly.spacy.io/api/pipe#finish_update - """ - self.model.finish_update(sgd) - - def score(self, examples, **kwargs): + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Union[float, Dict[str, float]]]: """Score a batch of examples. examples (Iterable[Example]): The examples to score. @@ -269,81 +78,25 @@ cdef class Pipe: """ return {} - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. 
+ @property + def is_trainable(self) -> bool: + return False - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. + @property + def labels(self) -> Optional[Tuple[str]]: + return tuple() - DOCS: https://nightly.spacy.io/api/pipe#to_bytes + @property + def label_data(self): + """Optional JSON-serializable data that would be sufficient to recreate + the label set if provided to the `pipe.initialize()` method. """ - serialize = {} - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["model"] = self.model.to_bytes - if hasattr(self, "vocab"): - serialize["vocab"] = self.vocab.to_bytes - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Pipe): The loaded object. - - DOCS: https://nightly.spacy.io/api/pipe#from_bytes - """ - - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = {} - if hasattr(self, "vocab"): - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["model"] = load_model - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/pipe#to_disk - """ - serialize = {} - serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["vocab"] = lambda p: self.vocab.to_disk(p) - serialize["model"] = lambda p: self.model.to_disk(p) - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Pipe): The loaded object. - - DOCS: https://nightly.spacy.io/api/pipe#from_disk - """ - - def load_model(p): - try: - self.model.from_bytes(p.open("rb").read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) - deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) - deserialize["model"] = load_model - util.from_disk(path, deserialize, exclude) - return self + return None + def _require_labels(self) -> None: + """Raise an error if this component has no labels defined.""" + if not self.labels or list(self.labels) == [""]: + raise ValueError(Errors.E143.format(name=self.name)) def deserialize_config(path): if path.exists(): diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 13fcd15e2..7656b330c 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,9 +58,6 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def initialize(self, get_examples, nlp=None): - pass - def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. 
@@ -204,9 +201,3 @@ class Sentencizer(Pipe): cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self - - def get_loss(self, examples, scores): - raise NotImplementedError - - def add_label(self, label): - raise NotImplementedError diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 8fb1e664f..8ea4ed1b3 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -6,12 +6,11 @@ from thinc.api import Model, SequenceCategoricalCrossentropy, Config from ..tokens.doc cimport Doc -from .pipe import deserialize_config from .tagger import Tagger from ..language import Language from ..errors import Errors from ..scorer import Scorer -from ..training import validate_examples +from ..training import validate_examples, validate_get_examples from .. import util @@ -62,6 +61,7 @@ class SentenceRecognizer(Tagger): self.name = name self._rehearsal_model = None self.cfg = {} + self._added_strings = set() @property def labels(self): @@ -138,7 +138,7 @@ class SentenceRecognizer(Tagger): DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "SentenceRecognizer.initialize") doc_sample = [] label_sample = [] assert self.labels, Errors.E924.format(name=self.name) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index dd10c5670..535b71270 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -11,13 +11,14 @@ from ..tokens.doc cimport Doc from ..morphology cimport Morphology from ..vocab cimport Vocab -from .pipe import Pipe, deserialize_config +from .trainable_pipe import TrainablePipe +from .pipe import deserialize_config from ..language import Language from ..attrs import POS, ID from ..parts_of_speech import X from ..errors import Errors, Warnings from ..scorer import Scorer -from ..training import validate_examples +from ..training import validate_examples, validate_get_examples from .. import util @@ -55,7 +56,7 @@ def make_tagger(nlp: Language, name: str, model: Model): return Tagger(nlp.vocab, model, name) -class Tagger(Pipe): +class Tagger(TrainablePipe): """Pipeline component for part-of-speech tagging. DOCS: https://nightly.spacy.io/api/tagger @@ -77,6 +78,7 @@ class Tagger(Pipe): self._rehearsal_model = None cfg = {"labels": labels or []} self.cfg = dict(sorted(cfg.items())) + self._added_strings = set() @property def labels(self): @@ -274,7 +276,7 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "Tagger.initialize") if labels is not None: for tag in labels: self.add_label(tag) @@ -311,7 +313,7 @@ class Tagger(Pipe): return 0 self._allow_extra_label() self.cfg["labels"].append(label) - self.vocab.strings.add(label) + self.add_string(label) return 1 def score(self, examples, **kwargs): @@ -325,79 +327,3 @@ class Tagger(Pipe): """ validate_examples(examples, "Tagger.score") return Scorer.score_token_attr(examples, "tag", **kwargs) - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. 
- - DOCS: https://nightly.spacy.io/api/tagger#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The loaded Tagger. - - DOCS: https://nightly.spacy.io/api/tagger#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/tagger#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: self.model.to_disk(p), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The modified Tagger object. - - DOCS: https://nightly.spacy.io/api/tagger#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index cc7a76288..e57954184 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -4,9 +4,9 @@ from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Conf from thinc.types import Floats2d import numpy -from .pipe import Pipe +from .trainable_pipe import TrainablePipe from ..language import Language -from ..training import Example, validate_examples +from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors from ..scorer import Scorer from .. import util @@ -85,7 +85,7 @@ def make_textcat( return TextCategorizer(nlp.vocab, model, name, threshold=threshold) -class TextCategorizer(Pipe): +class TextCategorizer(TrainablePipe): """Pipeline component for text classification. DOCS: https://nightly.spacy.io/api/textcategorizer @@ -110,6 +110,7 @@ class TextCategorizer(Pipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) + self._added_strings = set() @property def labels(self) -> Tuple[str]: @@ -119,13 +120,6 @@ class TextCategorizer(Pipe): """ return tuple(self.cfg["labels"]) - @labels.setter - def labels(self, value: List[str]) -> None: - # TODO: This really shouldn't be here. 
I had a look and I added it when - # I added the labels property, but it's pretty nasty to have this, and - # will lead to problems. - self.cfg["labels"] = tuple(value) - @property def label_data(self) -> List[str]: """RETURNS (List[str]): Information about the component's labels.""" @@ -306,7 +300,8 @@ class TextCategorizer(Pipe): if label in self.labels: return 0 self._allow_extra_label() - self.labels = tuple(list(self.labels) + [label]) + self.cfg["labels"].append(label) + self.add_string(label) return 1 def initialize( @@ -329,7 +324,7 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "TextCategorizer.initialize") if labels is None: for example in get_examples(): for cat in example.y.cats: diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 0f309326e..b4625291b 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -2,8 +2,8 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List from thinc.api import Model, set_dropout_rate, Optimizer, Config from itertools import islice -from .pipe import Pipe -from ..training import Example, validate_examples +from .trainable_pipe import TrainablePipe +from ..training import Example, validate_examples, validate_get_examples from ..tokens import Doc from ..vocab import Vocab from ..language import Language @@ -32,7 +32,7 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": return Tok2Vec(nlp.vocab, model, name) -class Tok2Vec(Pipe): +class Tok2Vec(TrainablePipe): """Apply a "token-to-vector" model and set its outputs in the doc.tensor attribute. This is mostly useful to share a single subnetwork between multiple components, e.g. to have one embedding and CNN network shared between a @@ -64,6 +64,7 @@ class Tok2Vec(Pipe): self.name = name self.listeners = [] self.cfg = {} + self._added_strings = set() def add_listener(self, listener: "Tok2VecListener") -> None: """Add a listener for a downstream component. Usually internals.""" @@ -218,7 +219,7 @@ class Tok2Vec(Pipe): DOCS: https://nightly.spacy.io/api/tok2vec#initialize """ - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "Tok2Vec.initialize") doc_sample = [] for example in islice(get_examples(), 10): doc_sample.append(example.x) diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd new file mode 100644 index 000000000..8df5cb775 --- /dev/null +++ b/spacy/pipeline/trainable_pipe.pxd @@ -0,0 +1,8 @@ +from .pipe cimport Pipe +from ..vocab cimport Vocab + +cdef class TrainablePipe(Pipe): + cdef public Vocab vocab + cdef public object model + cdef public object cfg + cdef public set _added_strings diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx new file mode 100644 index 000000000..07a308953 --- /dev/null +++ b/spacy/pipeline/trainable_pipe.pyx @@ -0,0 +1,322 @@ +# cython: infer_types=True, profile=True +from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable +import srsly +from thinc.api import set_dropout_rate, Model, Optimizer + +from ..tokens.doc cimport Doc + +from ..training import validate_examples +from ..errors import Errors +from .pipe import Pipe, deserialize_config +from .. import util +from ..vocab import Vocab +from ..language import Language +from ..training import Example + +cdef class TrainablePipe(Pipe): + """This class is a base class and not instantiated directly. 
Trainable + pipeline components like the EntityRecognizer or TextCategorizer inherit + from it and it defines the interface that components should follow to + function as trainable components in a spaCy pipeline. + + DOCS: https://nightly.spacy.io/api/pipe + """ + def __init__(self, vocab: Vocab, model: Model, name: str, **cfg): + """Initialize a pipeline component. + + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name. + **cfg: Additonal settings and config parameters. + + DOCS: https://nightly.spacy.io/api/pipe#init + """ + self.vocab = vocab + self.model = model + self.name = name + self.cfg = dict(cfg) + self._added_strings = set() + + def __call__(self, Doc doc) -> Doc: + """Apply the pipe to one document. The document is modified in place, + and returned. This usually happens under the hood when the nlp object + is called on a text and all components are applied to the Doc. + + docs (Doc): The Doc to process. + RETURNS (Doc): The processed Doc. + + DOCS: https://nightly.spacy.io/api/pipe#call + """ + scores = self.predict([doc]) + self.set_annotations([doc], scores) + return doc + + def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: + """Apply the pipe to a stream of documents. This usually happens under + the hood when the nlp object is called on a text and all components are + applied to the Doc. + + stream (Iterable[Doc]): A stream of documents. + batch_size (int): The number of documents to buffer. + YIELDS (Doc): Processed documents in order. + + DOCS: https://nightly.spacy.io/api/pipe#pipe + """ + for docs in util.minibatch(stream, size=batch_size): + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + + def predict(self, docs: Iterable[Doc]): + """Apply the pipeline's model to a batch of docs, without modifying them. + Returns a single tensor for a batch of documents. + + docs (Iterable[Doc]): The documents to predict. + RETURNS: Vector representations of the predictions. + + DOCS: https://nightly.spacy.io/api/pipe#predict + """ + raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="predict", name=self.name)) + + def set_annotations(self, docs: Iterable[Doc], scores): + """Modify a batch of documents, using pre-computed scores. + + docs (Iterable[Doc]): The documents to modify. + scores: The scores to assign. + + DOCS: https://nightly.spacy.io/api/pipe#set_annotations + """ + raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="set_annotations", name=self.name)) + + def update(self, + examples: Iterable["Example"], + *, drop: float=0.0, + set_annotations: bool=False, + sgd: Optimizer=None, + losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: + """Learn from a batch of documents and gold-standard information, + updating the pipe's model. Delegates to predict and get_loss. + + examples (Iterable[Example]): A batch of Example objects. + drop (float): The dropout rate. + set_annotations (bool): Whether or not to update the Example objects + with the predictions. + sgd (thinc.api.Optimizer): The optimizer. + losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. 
+ + DOCS: https://nightly.spacy.io/api/pipe#update + """ + if losses is None: + losses = {} + if not hasattr(self, "model") or self.model in (None, True, False): + return losses + losses.setdefault(self.name, 0.0) + validate_examples(examples, "TrainablePipe.update") + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + # Handle cases where there are no tokens in any docs. + return + set_dropout_rate(self.model, drop) + scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples]) + loss, d_scores = self.get_loss(examples, scores) + bp_scores(d_scores) + if sgd not in (None, False): + self.finish_update(sgd) + losses[self.name] += loss + if set_annotations: + docs = [eg.predicted for eg in examples] + self.set_annotations(docs, scores=scores) + return losses + + def rehearse(self, + examples: Iterable[Example], + *, + sgd: Optimizer=None, + losses: Dict[str, float]=None, + **config) -> Dict[str, float]: + """Perform a "rehearsal" update from a batch of data. Rehearsal updates + teach the current model to make predictions similar to an initial model, + to try to address the "catastrophic forgetting" problem. This feature is + experimental. + + examples (Iterable[Example]): A batch of Example objects. + sgd (thinc.api.Optimizer): The optimizer. + losses (Dict[str, float]): Optional record of the loss during training. + Updated using the component name as the key. + RETURNS (Dict[str, float]): The updated losses dictionary. + + DOCS: https://nightly.spacy.io/api/pipe#rehearse + """ + pass + + def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]: + """Find the loss and gradient of loss for the batch of documents and + their predicted scores. + + examples (Iterable[Examples]): The batch of examples. + scores: Scores representing the model's predictions. + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://nightly.spacy.io/api/pipe#get_loss + """ + raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name)) + + def create_optimizer(self) -> Optimizer: + """Create an optimizer for the pipeline component. + + RETURNS (thinc.api.Optimizer): The optimizer. + + DOCS: https://nightly.spacy.io/api/pipe#create_optimizer + """ + return util.create_default_optimizer() + + def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): + """Initialize the pipe for training, using data examples if available. + This method needs to be implemented by each TrainablePipe component, + ensuring the internal model (if available) is initialized properly + using the provided sample of Example objects. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + + DOCS: https://nightly.spacy.io/api/pipe#initialize + """ + raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="initialize", name=self.name)) + + def add_label(self, label: str) -> int: + """Add an output label. + For TrainablePipe components, it is possible to + extend pretrained models with new labels, but care should be taken to + avoid the "catastrophic forgetting" problem. + + label (str): The label to add. + RETURNS (int): 0 if label is already present, otherwise 1. 
+ + DOCS: https://nightly.spacy.io/api/pipe#add_label + """ + raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name)) + + def add_string(self, string: str): + self._added_strings.add(string) + return self.vocab.strings.add(string) + + @property + def is_trainable(self) -> bool: + return True + + @property + def is_resizable(self) -> bool: + return getattr(self, "model", None) and "resize_output" in self.model.attrs + + def _allow_extra_label(self) -> None: + """Raise an error if the component can not add any more labels.""" + if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): + if not self.is_resizable: + raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) + + def set_output(self, nO: int) -> None: + if self.is_resizable: + self.model.attrs["resize_output"](self.model, nO) + else: + raise NotImplementedError(Errors.E921) + + def use_params(self, params: dict): + """Modify the pipe's model, to use the given parameter values. At the + end of the context, the original parameters are restored. + + params (dict): The parameter values to use in the model. + + DOCS: https://nightly.spacy.io/api/pipe#use_params + """ + with self.model.use_params(params): + yield + + def finish_update(self, sgd: Optimizer) -> None: + """Update parameters using the current parameter gradients. + The Optimizer instance contains the functionality to perform + the stochastic gradient descent. + + sgd (thinc.api.Optimizer): The optimizer. + + DOCS: https://nightly.spacy.io/api/pipe#finish_update + """ + self.model.finish_update(sgd) + + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + + DOCS: https://nightly.spacy.io/api/pipe#to_bytes + """ + serialize = {} + if hasattr(self, "cfg"): + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["model"] = self.model.to_bytes + serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings)) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. + + DOCS: https://nightly.spacy.io/api/pipe#from_bytes + """ + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)] + if hasattr(self, "cfg"): + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, *, exclude=tuple()): + """Serialize the pipe to disk. + + path (str / Path): Path to a directory. + exclude (Iterable[str]): String names of serialization fields to exclude. + + DOCS: https://nightly.spacy.io/api/pipe#to_disk + """ + serialize = {} + if hasattr(self, "cfg"): + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings) + serialize["model"] = lambda p: self.model.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, *, exclude=tuple()): + """Load the pipe from disk. + + path (str / Path): Path to a directory. 
+ exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. + + DOCS: https://nightly.spacy.io/api/pipe#from_disk + """ + + def load_model(p): + try: + self.model.from_bytes(p.open("rb").read()) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)] + if hasattr(self, "cfg"): + deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) + deserialize["model"] = load_model + util.from_disk(path, deserialize, exclude) + return self diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd index 67bc01f97..bd5bad334 100644 --- a/spacy/pipeline/transition_parser.pxd +++ b/spacy/pipeline/transition_parser.pxd @@ -1,13 +1,13 @@ from cymem.cymem cimport Pool from ..vocab cimport Vocab -from .pipe cimport Pipe +from .trainable_pipe cimport TrainablePipe from ._parser_internals.transition_system cimport Transition, TransitionSystem from ._parser_internals._state cimport StateC from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC -cdef class Parser(Pipe): +cdef class Parser(TrainablePipe): cdef public object _rehearsal_model cdef readonly TransitionSystem moves cdef public object _multitasks diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 3b4406757..3743e1018 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -21,13 +21,14 @@ from ..ml.parser_model cimport predict_states, arg_max_if_valid from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..tokens.doc cimport Doc +from .trainable_pipe import TrainablePipe -from ..training import validate_examples +from ..training import validate_examples, validate_get_examples from ..errors import Errors, Warnings from .. import util -cdef class Parser(Pipe): +cdef class Parser(TrainablePipe): """ Base class of the DependencyParser and EntityRecognizer. 
""" @@ -75,6 +76,7 @@ cdef class Parser(Pipe): self.add_multitask_objective(multitask) self._rehearsal_model = None + self._added_strings = set() def __getnewargs_ex__(self): """This allows pickling the Parser and its keyword-only init arguments""" @@ -118,6 +120,7 @@ cdef class Parser(Pipe): resized = True if resized: self._resize() + self.add_string(label) return 1 return 0 @@ -411,7 +414,7 @@ cdef class Parser(Pipe): self.model.attrs["resize_output"](self.model, nO) def initialize(self, get_examples, nlp=None, labels=None): - self._ensure_examples(get_examples) + validate_get_examples(get_examples, "Parser.initialize") lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: langs = ", ".join(util.LEXEME_NORM_LANGS) @@ -439,7 +442,7 @@ cdef class Parser(Pipe): break # non-trainable components may have a pipe() implementation that refers to dummy # predict and set_annotations methods - if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable(): + if hasattr(component, "pipe"): doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] @@ -454,7 +457,7 @@ cdef class Parser(Pipe): def to_disk(self, path, exclude=tuple()): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), - 'vocab': lambda p: self.vocab.to_disk(p), + 'strings.json': lambda p: srsly.write_json(p, self._added_strings), 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), 'cfg': lambda p: srsly.write_json(p, self.cfg) } @@ -462,7 +465,7 @@ cdef class Parser(Pipe): def from_disk(self, path, exclude=tuple()): deserializers = { - 'vocab': lambda p: self.vocab.from_disk(p), + 'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)], 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), 'model': lambda p: None, @@ -482,7 +485,7 @@ cdef class Parser(Pipe): def to_bytes(self, exclude=tuple()): serializers = { "model": lambda: (self.model.to_bytes()), - "vocab": lambda: self.vocab.to_bytes(), + "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)), "moves": lambda: self.moves.to_bytes(exclude=["strings"]), "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) } @@ -490,7 +493,7 @@ cdef class Parser(Pipe): def from_bytes(self, bytes_data, exclude=tuple()): deserializers = { - "vocab": lambda b: self.vocab.from_bytes(b), + "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)], "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: None, diff --git a/spacy/schemas.py b/spacy/schemas.py index dc7a86b06..07d17d193 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -368,7 +368,7 @@ class ConfigSchemaInit(BaseModel): vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") - components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") # fmt: on 
class Config: diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index e77be74ad..71496327b 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -133,7 +133,7 @@ def test_kb_custom_length(nlp): def test_kb_initialize_empty(nlp): """Test that the EL can't initialize without examples""" entity_linker = nlp.add_pipe("entity_linker") - with pytest.raises(ValueError): + with pytest.raises(TypeError): entity_linker.initialize(lambda: []) @@ -153,6 +153,23 @@ def test_kb_serialize(nlp): mykb.from_disk(d / "unknown" / "kb") +def test_kb_serialize_vocab(nlp): + """Test serialization of the KB and custom strings""" + entity = "MyFunnyID" + assert entity not in nlp.vocab.strings + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + assert not mykb.contains_entity(entity) + mykb.add_entity(entity, freq=342, entity_vector=[3]) + assert mykb.contains_entity(entity) + assert entity in mykb.vocab.strings + with make_tempdir() as d: + # normal read-write behaviour + mykb.to_disk(d / "kb") + mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1) + mykb_new.from_disk(d / "kb") + assert entity in mykb_new.vocab.strings + + def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -413,6 +430,7 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() vector_length = 3 + assert "Q2146908" not in nlp.vocab.strings # Convert the texts to docs to make sure we have doc.ents set for the training examples train_examples = [] @@ -440,6 +458,9 @@ def test_overfitting_IO(): last=True, ) entity_linker.set_kb(create_kb) + assert "Q2146908" in entity_linker.vocab.strings + assert "Q2146908" in entity_linker.kb.vocab.strings + assert "Q2146908" in entity_linker.kb._added_strings # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -474,6 +495,10 @@ def test_overfitting_IO(): nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) assert nlp2.pipe_names == nlp.pipe_names + assert "Q2146908" in nlp2.vocab.strings + entity_linker2 = nlp2.get_pipe("entity_linker") + assert "Q2146908" in entity_linker2.vocab.strings + assert "Q2146908" in entity_linker2.kb.vocab.strings predictions = [] for text, annotation in TRAIN_DATA: doc2 = nlp2(text) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index af81129c0..ce9c0fa54 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -66,9 +66,9 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: None) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=train_examples) @@ -101,3 +101,4 @@ def test_overfitting_IO(): doc2 = nlp2(test_text) assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags + assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 4b96992e1..c693a7487 100644 --- 
a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,6 +1,6 @@ import pytest from spacy.language import Language -from spacy.pipeline import Pipe +from spacy.pipeline import TrainablePipe from spacy.util import SimpleFrozenList, get_arg_names @@ -376,7 +376,7 @@ def test_pipe_label_data_no_labels(pipe): def test_warning_pipe_begin_training(): with pytest.warns(UserWarning, match="begin_training"): - class IncompatPipe(Pipe): + class IncompatPipe(TrainablePipe): def __init__(self): ... diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index c64dfcbd6..472216512 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -40,9 +40,9 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: None) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=train_examples) @@ -80,3 +80,4 @@ def test_overfitting_IO(): nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts + assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index b32925d84..590c22233 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -74,13 +74,13 @@ def test_initialize_examples(): # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() nlp.initialize(get_examples=lambda: train_examples) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: None) with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: train_examples[0]) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: []) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=train_examples) @@ -98,6 +98,7 @@ def test_overfitting_IO(): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["tagger"] < 0.00001 + assert tagger._added_strings == {"J", "N", "V"} # test the trained model test_text = "I like blue eggs" @@ -116,6 +117,7 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" + assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"} def test_tagger_requires_labels(): diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index e950c81c6..7eb7ff658 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -127,9 +127,9 @@ def test_initialize_examples(): nlp.initialize() get_examples = make_get_examples(nlp) nlp.initialize(get_examples=get_examples) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=lambda: None) - with pytest.raises(ValueError): + with pytest.raises(TypeError): nlp.initialize(get_examples=get_examples()) @@ -146,6 +146,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 + assert 
textcat._added_strings == {"NEGATIVE", "POSITIVE"} for i in range(50): losses = {} @@ -167,6 +168,7 @@ def test_overfitting_IO(): cats2 = doc2.cats assert cats2["POSITIVE"] > 0.9 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001) + assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"} # Test scoring scores = nlp.evaluate(train_examples) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 0e2579ac4..73aea5b4b 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -1,5 +1,5 @@ import pytest -from spacy.pipeline import Pipe +from spacy.pipeline import TrainablePipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin from spacy.training import Example, Corpus @@ -271,7 +271,7 @@ def test_issue4272(): def test_multiple_predictions(): - class DummyPipe(Pipe): + class DummyPipe(TrainablePipe): def __init__(self): self.model = "dummy_model" diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9fda413a3..02d0c70dd 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,4 +1,3 @@ -from typing import Callable import warnings from unittest import TestCase import pytest @@ -7,8 +6,7 @@ from numpy import zeros from spacy.kb import KnowledgeBase, Writer from spacy.vectors import Vectors from spacy.language import Language -from spacy.pipeline import Pipe -from spacy.util import registry +from spacy.pipeline import TrainablePipe from ..util import make_tempdir @@ -45,14 +43,13 @@ def custom_pipe(): def from_disk(self, path, exclude=tuple(), **kwargs): return self - class MyPipe(Pipe): + class MyPipe(TrainablePipe): def __init__(self, vocab, model=True, **cfg): if cfg: self.cfg = cfg else: self.cfg = None self.model = SerializableDummy() - self.vocab = SerializableDummy() return MyPipe(None) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index f90531dbb..dfd7f6bd4 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,5 +1,6 @@ import pytest -from spacy import registry +import srsly +from spacy import registry, Vocab from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import TextCategorizer, SentenceRecognizer from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL @@ -69,6 +70,29 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): assert bytes_2 == bytes_3 +@pytest.mark.parametrize("Parser", test_parsers) +def test_serialize_parser_strings(Parser): + vocab1 = Vocab() + label = "FunnyLabel" + assert label not in vocab1.strings + config = { + "learn_tokens": False, + "min_action_freq": 0, + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser1 = Parser(vocab1, model, **config) + parser1.add_label(label) + assert label in parser1.vocab.strings + vocab2 = Vocab() + assert label not in vocab2.strings + parser2 = Parser(vocab2, model, **config) + parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) + assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"} + assert label in parser2.vocab.strings + + @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, Parser): 
config = { @@ -132,6 +156,29 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): assert tagger1_d.to_bytes() == tagger2_d.to_bytes() +def test_serialize_tagger_strings(en_vocab, de_vocab, taggers): + label = "SomeWeirdLabel" + assert label not in en_vocab.strings + assert label not in de_vocab.strings + tagger = taggers[0] + assert label not in tagger.vocab.strings + with make_tempdir() as d: + # check that custom labels are serialized as part of the component's strings.jsonl + tagger.add_label(label) + assert label in tagger.vocab.strings + assert tagger._added_strings == {label} + file_path = d / "tagger1" + tagger.to_disk(file_path) + strings = srsly.read_json(file_path / "strings.json") + assert strings == ["SomeWeirdLabel"] + # ensure that the custom strings are loaded back in when using the tagger in another pipeline + cfg = {"model": DEFAULT_TAGGER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + tagger2 = Tagger(de_vocab, model).from_disk(file_path) + assert label in tagger2.vocab.strings + assert tagger2._added_strings == {label} + + def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index f71a5f521..86341dd9a 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,5 +1,5 @@ from .corpus import Corpus # noqa: F401 -from .example import Example, validate_examples # noqa: F401 +from .example import Example, validate_examples, validate_get_examples # noqa: F401 from .align import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 1f3a36b33..a8da49c61 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -44,6 +44,24 @@ def validate_examples(examples, method): raise TypeError(err) +def validate_get_examples(get_examples, method): + """Check that a generator of a batch of examples received during processing is valid: + the callable produces a non-empty list of Example objects. + This function lives here to prevent circular imports. + + get_examples (Callable[[], Iterable[Example]]): A function that produces a batch of examples. + method (str): The method name to show in error messages. 
+ """ + if get_examples is None or not hasattr(get_examples, "__call__"): + err = Errors.E930.format(method=method, obj=type(get_examples)) + raise TypeError(err) + examples = get_examples() + if not examples: + err = Errors.E930.format(method=method, obj=examples) + raise TypeError(err) + validate_examples(examples, method) + + cdef class Example: def __init__(self, Doc predicted, Doc reference, *, alignment=None): if predicted is None: diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 3a133a0df..b431ecf06 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -21,7 +21,7 @@ def console_logger(progress_bar: bool = False): logged_pipes = [ name for name, proc in nlp.pipeline - if hasattr(proc, "is_trainable") and proc.is_trainable() + if hasattr(proc, "is_trainable") and proc.is_trainable ] eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 48cf582e6..242113cc6 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -188,7 +188,7 @@ def train_while_improving( if ( name not in exclude and hasattr(proc, "is_trainable") - and proc.is_trainable() + and proc.is_trainable and proc.model not in (True, False, None) ): proc.finish_update(optimizer) diff --git a/spacy/util.py b/spacy/util.py index aa321b22f..bf4ea0c92 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1356,3 +1356,16 @@ def check_bool_env_var(env_var: str) -> bool: if value == "0": return False return bool(value) + + +def _pipe(docs, proc, kwargs): + if hasattr(proc, "pipe"): + yield from proc.pipe(docs, **kwargs) + # We added some args for pipe that __call__ doesn't expect. + kwargs = dict(kwargs) + for arg in ["batch_size"]: + if arg in kwargs: + kwargs.pop(arg) + for doc in docs: + doc = proc(doc, **kwargs) + yield doc diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index b98768dcf..e7adcdd75 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -1,5 +1,5 @@ --- -title: Pipe +title: TrainablePipe tag: class teaser: Base class for trainable pipeline components --- @@ -10,30 +10,32 @@ components like the [`EntityRecognizer`](/api/entityrecognizer) or interface that components should follow to function as trainable components in a spaCy pipeline. See the docs on [writing trainable components](/usage/processing-pipelines#trainable-components) -for how to use the `Pipe` base class to implement custom components. +for how to use the `TrainablePipe` base class to implement custom components. -> #### Why is Pipe implemented in Cython? + + +> #### Why is TrainablePipe implemented in Cython? > -> The `Pipe` class is implemented in a `.pyx` module, the extension used by -> [Cython](/api/cython). This is needed so that **other** Cython classes, like -> the [`EntityRecognizer`](/api/entityrecognizer) can inherit from it. But it -> doesn't mean you have to implement trainable components in Cython – pure -> Python components like the [`TextCategorizer`](/api/textcategorizer) can also -> inherit from `Pipe`. +> The `TrainablePipe` class is implemented in a `.pyx` module, the extension +> used by [Cython](/api/cython). This is needed so that **other** Cython +> classes, like the [`EntityRecognizer`](/api/entityrecognizer) can inherit from +> it. 
But it doesn't mean you have to implement trainable components in Cython – +> pure Python components like the [`TextCategorizer`](/api/textcategorizer) can +> also inherit from `TrainablePipe`. ```python -%%GITHUB_SPACY/spacy/pipeline/pipe.pyx +%%GITHUB_SPACY/spacy/pipeline/trainable_pipe.pyx ``` -## Pipe.\_\_init\_\_ {#init tag="method"} +## TrainablePipe.\_\_init\_\_ {#init tag="method"} > #### Example > > ```python -> from spacy.pipeline import Pipe +> from spacy.pipeline import TrainablePipe > from spacy.language import Language > -> class CustomPipe(Pipe): +> class CustomPipe(TrainablePipe): > ... > > @Language.factory("your_custom_pipe", default_config={"model": MODEL}) @@ -45,14 +47,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| ------- | ------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. | +| Name | Description | +| ------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `cfg` and is serialized with the component. | -## Pipe.\_\_call\_\_ {#call tag="method"} +## TrainablePipe.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the `nlp` object is called on a text @@ -75,7 +77,7 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | -## Pipe.pipe {#pipe tag="method"} +## TrainablePipe.pipe {#pipe tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are @@ -98,7 +100,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Pipe.initialize {#initialize tag="method" new="3"} +## TrainablePipe.initialize {#initialize tag="method" new="3"} Initialize the component for training. `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples are @@ -128,7 +130,7 @@ This method was previously called `begin_training`. | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | -## Pipe.predict {#predict tag="method"} +## TrainablePipe.predict {#predict tag="method"} Apply the component's model to a batch of [`Doc`](/api/doc) objects, without modifying them. @@ -151,7 +153,7 @@ This method needs to be overwritten with your own custom `predict` method. | `docs` | The documents to predict. ~~Iterable[Doc]~~ | | **RETURNS** | The model's prediction for each document. | -## Pipe.set_annotations {#set_annotations tag="method"} +## TrainablePipe.set_annotations {#set_annotations tag="method"} Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. @@ -175,7 +177,7 @@ method. | `docs` | The documents to modify. ~~Iterable[Doc]~~ | | `scores` | The scores to set, produced by `Tagger.predict`. | -## Pipe.update {#update tag="method"} +## TrainablePipe.update {#update tag="method"} Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. @@ -198,7 +200,7 @@ predictions and gold-standard annotations, and update the component's model. | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## Pipe.rehearse {#rehearse tag="method,experimental" new="3"} +## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the current model to make predictions similar to an initial model, to try to address @@ -216,12 +218,11 @@ the "catastrophic forgetting" problem. This feature is experimental. | -------------- | ------------------------------------------------------------------------------------------------------------------------ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | | _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## Pipe.get_loss {#get_loss tag="method"} +## TrainablePipe.get_loss {#get_loss tag="method"} Find the loss and gradient of loss for the batch of documents and their predicted scores. @@ -246,7 +247,7 @@ This method needs to be overwritten with your own custom `get_loss` method. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## Pipe.score {#score tag="method" new="3"} +## TrainablePipe.score {#score tag="method" new="3"} Score a batch of examples. @@ -261,7 +262,7 @@ Score a batch of examples. | `examples` | The examples to score. ~~Iterable[Example]~~ | | **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | -## Pipe.create_optimizer {#create_optimizer tag="method"} +## TrainablePipe.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam) with default settings. @@ -277,7 +278,7 @@ Create an optimizer for the pipeline component. Defaults to | ----------- | ---------------------------- | | **RETURNS** | The optimizer. 
~~Optimizer~~ | -## Pipe.use_params {#use_params tag="method, contextmanager"} +## TrainablePipe.use_params {#use_params tag="method, contextmanager"} Modify the pipe's model, to use the given parameter values. At the end of the context, the original parameters are restored. @@ -294,7 +295,7 @@ context, the original parameters are restored. | -------- | -------------------------------------------------- | | `params` | The parameter values to use in the model. ~~dict~~ | -## Pipe.finish_update {#finish_update tag="method"} +## TrainablePipe.finish_update {#finish_update tag="method"} Update parameters using the current parameter gradients. Defaults to calling [`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update). @@ -312,7 +313,7 @@ Update parameters using the current parameter gradients. Defaults to calling | ----- | ------------------------------------- | | `sgd` | An optimizer. ~~Optional[Optimizer]~~ | -## Pipe.add_label {#add_label tag="method"} +## TrainablePipe.add_label {#add_label tag="method"} > #### Example > @@ -347,12 +348,12 @@ case, all labels found in the sample will be automatically added to the model, and the output dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) automatically. -## Pipe.is_resizable {#is_resizable tag="method"} +## TrainablePipe.is_resizable {#is_resizable tag="property"} > #### Example > > ```python -> can_resize = pipe.is_resizable() +> can_resize = pipe.is_resizable > ``` > > With custom resizing implemented by a component: @@ -378,7 +379,7 @@ as an attribute to the component's model. | ----------- | ---------------------------------------------------------------------------------------------- | | **RETURNS** | Whether or not the output dimension of the model can be changed after initialization. ~~bool~~ | -## Pipe.set_output {#set_output tag="method"} +## TrainablePipe.set_output {#set_output tag="method"} Change the output dimension of the component's model. If the component is not [resizable](#is_resizable), this method will raise a `NotImplementedError`. If a @@ -390,7 +391,7 @@ care should be taken to avoid the "catastrophic forgetting" problem. > #### Example > > ```python -> if pipe.is_resizable(): +> if pipe.is_resizable: > pipe.set_output(512) > ``` @@ -398,7 +399,7 @@ care should be taken to avoid the "catastrophic forgetting" problem. | ---- | --------------------------------- | | `nO` | The new output dimension. ~~int~~ | -## Pipe.to_disk {#to_disk tag="method"} +## TrainablePipe.to_disk {#to_disk tag="method"} Serialize the pipe to disk. @@ -415,7 +416,7 @@ Serialize the pipe to disk. | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -## Pipe.from_disk {#from_disk tag="method"} +## TrainablePipe.from_disk {#from_disk tag="method"} Load the pipe from disk. Modifies the object in place and returns it. @@ -431,9 +432,9 @@ Load the pipe from disk. Modifies the object in place and returns it. | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The modified pipe. ~~Pipe~~ | +| **RETURNS** | The modified pipe. ~~TrainablePipe~~ | -## Pipe.to_bytes {#to_bytes tag="method"} +## TrainablePipe.to_bytes {#to_bytes tag="method"} > #### Example > @@ -450,7 +451,7 @@ Serialize the pipe to a bytestring. 
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | | **RETURNS** | The serialized form of the pipe. ~~bytes~~ | -## Pipe.from_bytes {#from_bytes tag="method"} +## TrainablePipe.from_bytes {#from_bytes tag="method"} Load the pipe from a bytestring. Modifies the object in place and returns it. @@ -467,16 +468,16 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. | `bytes_data` | The data to load from. ~~bytes~~ | | _keyword-only_ | | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | -| **RETURNS** | The pipe. ~~Pipe~~ | +| **RETURNS** | The pipe. ~~TrainablePipe~~ | ## Attributes {#attributes} -| Name | Description | -| ------- | ------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ | -| `model` | The model powering the component. ~~Model[List[Doc], Any]~~ | -| `name` | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ | -| `cfg` | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ | +| Name | Description | +| ------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ | +| `model` | The model powering the component. ~~Model[List[Doc], Any]~~ | +| `name` | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ | +| `cfg` | Keyword arguments passed to [`TrainablePipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ | ## Serialization fields {#serialization-fields} @@ -487,11 +488,10 @@ serialization by passing in the string names via the `exclude` argument. > #### Example > > ```python -> data = pipe.to_disk("/path", exclude=["vocab"]) +> data = pipe.to_disk("/path") > ``` | Name | Description | | ------- | -------------------------------------------------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | | `cfg` | The config file. You usually don't want to exclude this. | | `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 6e9120022..18203e204 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -57,7 +57,8 @@ components for different language processing tasks and also allows adding | [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | | [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | -| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. | +| [`Pipe`](/api/pipe) | Base class that pipeline components may inherit from. | +| [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. 
| ### Matchers {#architecture-matchers} diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 7fa60e0f1..e348c4389 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -491,13 +491,14 @@ In addition to [swapping out](#swap-architectures) default models in built-in components, you can also implement an entirely new, [trainable](/usage/processing-pipelines#trainable-components) pipeline component from scratch. This can be done by creating a new class inheriting from -[`Pipe`](/api/pipe), and linking it up to your custom model implementation. +[`TrainablePipe`](/api/pipe), and linking it up to your custom model +implementation. For details on how to implement pipeline components, check out the usage guide on [custom components](/usage/processing-pipelines#custom-component) and the -overview of the `Pipe` methods used by +overview of the `TrainablePipe` methods used by [trainable components](/usage/processing-pipelines#trainable-components). @@ -646,15 +647,15 @@ get_candidates = model.attrs["get_candidates"] To use our new relation extraction model as part of a custom [trainable component](/usage/processing-pipelines#trainable-components), we -create a subclass of [`Pipe`](/api/pipe) that holds the model. +create a subclass of [`TrainablePipe`](/api/pipe) that holds the model. ![Illustration of Pipe methods](../images/trainable_component.svg) ```python ### Pipeline component skeleton -from spacy.pipeline import Pipe +from spacy.pipeline import TrainablePipe -class RelationExtractor(Pipe): +class RelationExtractor(TrainablePipe): def __init__(self, vocab, model, name="rel"): """Create a component instance.""" self.model = model @@ -757,9 +758,10 @@ def update( When the internal model is trained, the component can be used to make novel **predictions**. The [`predict`](/api/pipe#predict) function needs to be -implemented for each subclass of `Pipe`. In our case, we can simply delegate to -the internal model's [predict](https://thinc.ai/docs/api-model#predict) function -that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array: +implemented for each subclass of `TrainablePipe`. In our case, we can simply +delegate to the internal model's +[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch +of `Doc` objects and returns a ~~Floats2d~~ array: ```python ### The predict method @@ -826,7 +828,7 @@ def __call__(self, Doc doc): return doc ``` -Once our `Pipe` subclass is fully implemented, we can +Once our `TrainablePipe` subclass is fully implemented, we can [register](/usage/processing-pipelines#custom-components-factories) the component with the [`@Language.factory`](/api/language#factory) decorator. This assigns it a name and lets you create the component with diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 8b4e39ee9..e33ea6001 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1169,10 +1169,10 @@ doc = nlp("This is a text...") ## Trainable components {#trainable-components new="3"} -spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable -components that have their own model instance, make predictions over `Doc` -objects and can be updated using [`spacy train`](/api/cli#train). This lets you -plug fully custom machine learning components into your pipeline. 
+spaCy's [`TrainablePipe`](/api/pipe) class helps you implement your own +trainable components that have their own model instance, make predictions over +`Doc` objects and can be updated using [`spacy train`](/api/cli#train). This +lets you plug fully custom machine learning components into your pipeline. ![Illustration of Pipe methods](../images/trainable_component.svg) @@ -1183,9 +1183,9 @@ You'll need the following: a [wrapped model](/usage/layers-architectures#frameworks) implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a list of [`Doc`](/api/doc) objects as input and can have any type of output. -2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least - two methods: [`Pipe.predict`](/api/pipe#predict) and - [`Pipe.set_annotations`](/api/pipe#set_annotations). +2. **TrainablePipe subclass:** A subclass of [`TrainablePipe`](/api/pipe) that + implements at least two methods: [`TrainablePipe.predict`](/api/pipe#predict) + and [`TrainablePipe.set_annotations`](/api/pipe#set_annotations). 3. **Component factory:** A component factory registered with [`@Language.factory`](/api/language#factory) that takes the `nlp` object and component `name` and optional settings provided by the config and returns an @@ -1194,10 +1194,10 @@ You'll need the following: > #### Example > > ```python -> from spacy.pipeline import Pipe +> from spacy.pipeline import TrainablePipe > from spacy.language import Language > -> class TrainableComponent(Pipe): +> class TrainableComponent(TrainablePipe): > def predict(self, docs): > ... > @@ -1214,11 +1214,11 @@ You'll need the following: | [`predict`](/api/pipe#predict) | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores. | | [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`. | -By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the -[`Model`](https://thinc.ai/docs/api-model) and the name of the component +By default, [`TrainablePipe.__init__`](/api/pipe#init) takes the shared vocab, +the [`Model`](https://thinc.ai/docs/api-model) and the name of the component instance in the pipeline, which you can use as a key in the losses. All other -keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will -also be serialized with the component. +keyword arguments will become available as [`TrainablePipe.cfg`](/api/pipe#cfg) +and will also be serialized with the component. diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index d9ab00b97..250fdb4f4 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -178,7 +178,8 @@ freely combine implementations from different frameworks into a single model. - **Thinc: ** [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks), [`Model` API](https://thinc.ai/docs/api-model) -- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe) +- **API:** [Model architectures](/api/architectures), + [`TrainablePipe`](/api/pipe) @@ -428,7 +429,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. 
| | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. | | [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. | -| [`Pipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. | +| [`TrainablePipe.score`](/api/pipe#score) | Method on pipeline components that returns a dictionary of evaluation scores. | | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. | @@ -483,7 +484,7 @@ format for documenting argument and return types. [`Morphologizer`](/api/morphologizer), [`AttributeRuler`](/api/attributeruler), [`SentenceRecognizer`](/api/sentencerecognizer), - [`DependencyMatcher`](/api/dependencymatcher), [`Pipe`](/api/pipe), + [`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe), [`Corpus`](/api/corpus) @@ -522,7 +523,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`@Language.factory`](/api/language#factory) decorator. - The [`Language.update`](/api/language#update), [`Language.evaluate`](/api/language#evaluate) and - [`Pipe.update`](/api/pipe#update) methods now all take batches of + [`TrainablePipe.update`](/api/pipe#update) methods now all take batches of [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or raw text and a dictionary of annotations. - The `begin_training` methods have been renamed to `initialize` and now take a @@ -947,7 +948,7 @@ annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]} The [`Language.update`](/api/language#update), [`Language.evaluate`](/api/language#evaluate) and -[`Pipe.update`](/api/pipe#update) methods now all take batches of +[`TrainablePipe.update`](/api/pipe#update) methods now all take batches of [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or raw text and a dictionary of annotations. @@ -967,12 +968,13 @@ for i in range(20): nlp.update(examples) ``` -`Language.begin_training` and `Pipe.begin_training` have been renamed to -[`Language.initialize`](/api/language#initialize) and -[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function -that returns a sequence of `Example` objects to initialize the model instead of -a list of tuples. The data examples are used to **initialize the models** of -trainable pipeline components, which includes validating the network, +`Language.begin_training` and `TrainablePipe.begin_training` have been renamed +to [`Language.initialize`](/api/language#initialize) and +[`TrainablePipe.initialize`](/api/pipe#initialize), and the methods now take a +function that returns a sequence of `Example` objects to initialize the model +instead of a list of tuples. 
The data examples are used to **initialize the +models** of trainable pipeline components, which includes validating the +network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme.
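
The renamed `initialize` API described above is exercised throughout the test changes in this diff. As a rough illustration (a sketch against the spacy-nightly v3 API this patch targets, with made-up sample data; not part of the diff itself):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
nlp.add_pipe("tagger")

# Build a tiny batch of Example objects, as the updated tests do.
doc = nlp.make_doc("I like green eggs")
train_examples = [Example.from_dict(doc, {"tags": ["N", "V", "J", "N"]})]

# initialize() expects a zero-argument callable that returns the examples.
nlp.initialize(get_examples=lambda: train_examples)

# Passing the examples themselves (or a callable returning None or []) now
# fails fast with a TypeError (E930) raised by validate_get_examples.
```

The other recurring change in this diff is that trainable components record the strings they add via `add_string`/`_added_strings` and serialize only those to `strings.json`, instead of round-tripping the entire shared `Vocab`. A minimal, self-contained sketch of that pattern (plain-Python stand-ins and a hypothetical `DummyComponent` class for illustration, not the actual spaCy implementation):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory


class DummyComponent:
    """Sketch of the add_string/_added_strings pattern used by TrainablePipe."""

    def __init__(self, strings: set):
        self.strings = strings          # stand-in for the shared vocab's StringStore
        self._added_strings = set()     # only the strings this component itself added

    def add_string(self, string: str) -> None:
        self._added_strings.add(string)
        self.strings.add(string)

    def to_disk(self, path: Path) -> None:
        path.mkdir(parents=True, exist_ok=True)
        # serialize the component's own strings, not the whole vocab
        (path / "strings.json").write_text(json.dumps(sorted(self._added_strings)))

    def from_disk(self, path: Path) -> "DummyComponent":
        # re-register the stored strings with whatever vocab is now shared
        for string in json.loads((path / "strings.json").read_text()):
            self.add_string(string)
        return self


with TemporaryDirectory() as tmp:
    vocab1, vocab2 = set(), set()
    comp1 = DummyComponent(vocab1)
    comp1.add_string("FunnyLabel")
    comp1.to_disk(Path(tmp) / "component")
    # a component created against a fresh vocab picks the label back up on load
    comp2 = DummyComponent(vocab2).from_disk(Path(tmp) / "component")
    assert "FunnyLabel" in vocab2
    assert comp2._added_strings == {"FunnyLabel"}
```

This mirrors what `test_serialize_parser_strings` and `test_serialize_tagger_strings` assert against the real components.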