add initialize method for entity_ruler

2025-11-10 21:07:53 +03:00 · 2020-10-05 14:59:13 +02:00 · 2020-10-05 14:59:13 +02:00 · 251b3eb4e5
commit 251b3eb4e5
parent e3acad6264
3 changed files with 32 additions and 2 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -456,6 +456,8 @@ class Errors:
            "issue tracker: http://github.com/explosion/spaCy/issues")
    # TODO: fix numbering after merging develop into master
    E900 = ("Patterns for component '{name}' not initialized. This can be fixed "
            "by calling 'add_patterns' or 'initialize'.")
    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
            "Try checking whitespace and delimiters. See "
            "https://nightly.spacy.io/api/cli#convert")
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -1,7 +1,8 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
 from collections import defaultdict
 from pathlib import Path
 import srsly
 from spacy.training import Example
 from ..language import Language
 from ..errors import Errors
@ -133,6 +134,7 @@ class EntityRuler:
        DOCS: https://nightly.spacy.io/api/entityruler#call
        """
        self._require_patterns()
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
@ -183,6 +185,27 @@ class EntityRuler:
                all_labels.add(l)
        return tuple(all_labels)
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        patterns_path: Optional[Path] = None
    ):
        """Initialize the pipe for training.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        patterns_path: Path to serialized patterns.
        DOCS (TODO): https://nightly.spacy.io/api/entityruler#initialize
        """
        if patterns_path:
            patterns = srsly.read_jsonl(patterns_path)
            self.add_patterns(patterns)
    @property
    def ent_ids(self) -> Tuple[str, ...]:
        """All entity ids present in the match patterns `id` properties
@ -292,6 +315,11 @@ class EntityRuler:
        self.phrase_patterns = defaultdict(list)
        self._ent_ids = defaultdict(dict)
    def _require_patterns(self) -> None:
        """Raise an error if the component has no patterns."""
        if not self.patterns or list(self.patterns) == [""]:
            raise ValueError(Errors.E900.format(name=self.name))
    def _split_label(self, label: str) -> Tuple[str, str]:
        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
            nlp.resume_training(sgd=optimizer)
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        logger.info("Initialized pipeline components")
+        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    return nlp