Merge pull request #6206 from svlandeg/fix/patterns-init

Commit 568e12215d, authored by Ines Montani on 2020-10-06 10:27:23 +02:00, committed via GitHub.
17 changed files with 156 additions and 39 deletions

View File

@@ -1091,10 +1091,11 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(sgd)
+                proc.finish_update(sgd)
         return losses

     def rehearse(

@@ -1297,7 +1298,9 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)

@@ -1407,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc

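The `hasattr(proc, "is_trainable")` guard above is needed because simple function components carry none of the `Pipe` API. A minimal sketch of such a component, assuming the v3 nightly API at the time of this commit (the name `debug_marker` is illustrative):

```python
from spacy.language import Language

@Language.component("debug_marker")
def debug_marker(doc):
    # A stateless function component: it has no model and no is_trainable(),
    # so Language.update() must skip it instead of calling finish_update().
    return doc
```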
View File

@@ -238,7 +238,7 @@ class EntityLinker(Pipe):
             )
             bp_context(d_scores)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)

View File

@@ -1,8 +1,10 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly

+from .pipe import Pipe
+from ..training import Example
 from ..language import Language
 from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
@@ -50,7 +52,7 @@ def make_entity_ruler(
     )


-class EntityRuler:
+class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
     `EntityRecognizer` to boost accuracy, or used on its own to implement a
@@ -183,6 +185,26 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)

+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Sequence[PatternType]] = None,
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns (Optional[Sequence[PatternType]]): The list of patterns.
+
+        DOCS: https://nightly.spacy.io/api/entityruler#initialize
+        """
+        if patterns:
+            self.add_patterns(patterns)
+
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
@@ -320,6 +342,12 @@ class EntityRuler:
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)

+    def predict(self, docs):
+        pass
+
+    def set_annotations(self, docs, scores):
+        pass
+
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":

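A minimal usage sketch of the new `initialize` hook, mirroring the tests added in this commit (the `ORG` pattern is illustrative):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# get_examples is part of the shared initialize signature but unused here
ruler.initialize(
    lambda: [],
    nlp=nlp,
    patterns=[{"label": "ORG", "pattern": "spaCy"}],
)
doc = nlp("spaCy is a library")
assert doc.ents[0].label_ == "ORG"
```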
View File

@@ -209,7 +209,7 @@ class ClozeMultitask(Pipe):
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
         return losses

View File

@@ -132,7 +132,7 @@ cdef class Pipe:
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]

@@ -228,6 +228,9 @@ cdef class Pipe:
     def is_resizable(self):
         return hasattr(self, "model") and "resize_output" in self.model.attrs

+    def is_trainable(self):
+        return hasattr(self, "model") and isinstance(self.model, Model)
+
     def set_output(self, nO):
         if self.is_resizable():
             self.model.attrs["resize_output"](self.model, nO)

@@ -245,6 +248,17 @@ cdef class Pipe:
         with self.model.use_params(params):
             yield

+    def finish_update(self, sgd):
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
     def score(self, examples, **kwargs):
         """Score a batch of examples.

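A plain-Python paraphrase of the two new base-class defaults; the shipped implementation is the Cython code above, so this is only a sketch:

```python
from thinc.api import Model

class PipeSketch:
    def is_trainable(self):
        # Trainable iff the component carries a real Thinc Model; some
        # components use True/False/None as placeholder "model" flags.
        return hasattr(self, "model") and isinstance(self.model, Model)

    def finish_update(self, sgd):
        # Apply accumulated gradients via the optimizer; subclasses can
        # override this to customise how their parameters are updated.
        self.model.finish_update(sgd)
```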
View File

@@ -203,7 +203,7 @@ class Tagger(Pipe):
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:

@@ -238,7 +238,7 @@ class Tagger(Pipe):
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
-        self.model.finish_update(sgd)
+        self.finish_update(sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()

View File

@@ -212,7 +212,7 @@ class TextCategorizer(Pipe):
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]

@@ -256,7 +256,7 @@ class TextCategorizer(Pipe):
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += (gradient ** 2).sum()
         return losses

View File

@@ -188,7 +188,7 @@ class Tok2Vec(Pipe):
             accumulate_gradient(one_d_tokvecs)
             d_docs = bp_tokvecs(d_tokvecs)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             return d_docs

         batch_id = Tok2VecListener.get_batch_id(docs)

View File

@@ -315,7 +315,7 @@ cdef class Parser(Pipe):
         backprop_tok2vec(golds)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)

@@ -367,7 +367,7 @@ cdef class Parser(Pipe):
         # Do the backprop
         backprop_tok2vec(docs)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss / n_scores
         del backprop
         del backprop_tok2vec

@@ -437,7 +437,9 @@ cdef class Parser(Pipe):
         for name, component in nlp.pipeline:
             if component is self:
                 break
-            if hasattr(component, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
                 doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]

View File

@@ -119,7 +119,7 @@ def validate_init_settings(
        if types don't match or required values are missing.

    func (Callable): The initialize method of a given component etc.
-    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
    section (str): Initialize section, for error message.
    name (str): Name of the block in the section.
    exclude (Iterable[str]): Parameter names to exclude from schema.

View File

@@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
-    # initialize with patterns from asset
+    # initialize with patterns from misc registry
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }

View File

@@ -1,4 +1,6 @@
 import pytest
+
+from spacy import registry
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler

@@ -11,6 +13,7 @@ def nlp():


 @pytest.fixture
+@registry.misc("entity_ruler_patterns")
 def patterns():
     return [
         {"label": "HELLO", "pattern": "hello world"},

@@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"


+def test_entity_ruler_init_patterns(nlp, patterns):
+    # initialize with patterns
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    ruler.initialize(lambda: [], patterns=patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+    nlp.remove_pipe("entity_ruler")
+    # initialize with patterns from misc registry
+    nlp.config["initialize"]["components"]["entity_ruler"] = {
+        "patterns": {"@misc": "entity_ruler_patterns"}
+    }
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    nlp.initialize()
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)

View File

@@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info("Initialized pipeline components")
+    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     return nlp

View File

@@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False):
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         msg = Printer(no_print=True)
-        # we assume here that only components are enabled that should be trained & logged
-        logged_pipes = nlp.pipe_names
+        # ensure that only trainable components are logged
+        logged_pipes = [
+            name
+            for name, proc in nlp.pipeline
+            if hasattr(proc, "is_trainable") and proc.is_trainable()
+        ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]

@@ -41,19 +45,10 @@ def console_logger(progress_bar: bool = False):
                 if progress is not None:
                     progress.update(1)
                 return
-            try:
-                losses = [
-                    "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in logged_pipes
-                ]
-            except KeyError as e:
-                raise KeyError(
-                    Errors.E983.format(
-                        dict="scores (losses)",
-                        key=str(e),
-                        keys=list(info["losses"].keys()),
-                    )
-                ) from None
+            losses = [
+                "{0:.2f}".format(float(info["losses"][pipe_name]))
+                for pipe_name in logged_pipes
+            ]

             scores = []
             for col in score_cols:

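With the filter above, a pipeline that mixes rule-based and trainable components only logs loss columns for the trainable ones, which is also why the KeyError fallback could be dropped. A sketch, assuming the nightly API where `is_trainable` is a method rather than a property:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # rule-based: no Thinc model attached
nlp.add_pipe("tagger")        # trainable: constructed with a Thinc model
logged_pipes = [
    name
    for name, proc in nlp.pipeline
    if hasattr(proc, "is_trainable") and proc.is_trainable()
]
assert logged_pipes == ["tagger"]
```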
View File

@@ -187,10 +187,11 @@ def train_while_improving(
         for name, proc in nlp.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(optimizer)
+                proc.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:

@@ -293,6 +294,7 @@ def update_meta(
     if metric is not None:
         nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

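The new `if pipe_name in info["losses"]` guard matters because non-trainable pipes no longer contribute a loss entry. A tiny worked sketch of the guarded behaviour (the data is made up):

```python
info = {"losses": {"tagger": 12.5}}
pipe_names = ["entity_ruler", "tagger"]  # entity_ruler produced no loss
performance = {}
for pipe_name in pipe_names:
    if pipe_name in info["losses"]:
        performance[f"{pipe_name}_loss"] = info["losses"][pipe_name]
assert performance == {"tagger_loss": 12.5}
```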
View File

@@ -74,6 +74,33 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~                                 |
 | `patterns`   | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |

+## EntityRuler.initialize {#initialize tag="method" new="3"}
+
+Initialize the component with patterns from a file.
+
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl"
+> ```
+
+| Name           | Description                                                                                                                                                           |
+| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |                                                                                                                                                                       |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                  |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
+
 ## EntityRuler.\_\_len\_\_ {#len tag="method"}

 The number of all patterns added to the entity ruler.

View File

@@ -294,6 +294,24 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |

+## Pipe.finish_update {#finish_update tag="method"}
+
+Update parameters using the current parameter gradients. Defaults to calling
+[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(optimizer)
+> ```
+
+| Name  | Description                           |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}

 > #### Example