Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)

* Update entityruler.py * Making ent_id resolution 2x faster and adding docs * Fixing newlines in docstrings * Fixing newlines in docstrings
2025-11-08 20:07:51 +03:00 · 2019-10-25 02:16:42 -07:00 · 2019-10-25 02:16:42 -07:00 · 93640373c7
commit 93640373c7
parent cc05d9dad6
3 changed files with 49 additions and 15 deletions
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -44,8 +44,8 @@ class EntityRuler(object):
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        RETURNS (EntityRuler): The newly constructed object.
        DOCS: https://spacy.io/api/entityruler#init
        """
        self.nlp = nlp
@ -64,6 +64,7 @@ class EntityRuler(object):
            self.phrase_matcher_attr = None
            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
        self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        self._ent_ids = defaultdict(dict)
        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)
@ -82,8 +83,8 @@ class EntityRuler(object):
        """Find matches in document and add them as entities.
        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        RETURNS (Doc): The Doc with added entities, if available.
        DOCS: https://spacy.io/api/entityruler#call
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
@ -100,10 +101,9 @@ class EntityRuler(object):
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if self.ent_ids:
+                if match_id in self._ent_ids:
-                    label_ = self.nlp.vocab.strings[match_id]
+                    label, ent_id = self._ent_ids[match_id]
-                    ent_label, ent_id = self._split_label(label_)
+                    span = Span(doc, start, end, label=label)
                    span = Span(doc, start, end, label=ent_label)
                    if ent_id:
                        for token in span:
                            token.ent_id_ = ent_id
@ -122,7 +122,6 @@ class EntityRuler(object):
        """All labels present in the match patterns.
        RETURNS (set): The string labels.
        DOCS: https://spacy.io/api/entityruler#labels
        """
        all_labels = set(self.token_patterns.keys())
@ -131,11 +130,10 @@ class EntityRuler(object):
    @property
    def ent_ids(self):
-        """All entity ids present in the match patterns meta dicts.
+        """All entity ids present in the match patterns `id` properties
        RETURNS (set): The string entity ids.
-
+        DOCS: https://spacy.io/api/entityruler#ent_ids
        DOCS: https://spacy.io/api/entityruler#labels
        """
        all_ent_ids = set()
        for l in self.labels:
@ -149,7 +147,6 @@ class EntityRuler(object):
        """Get all patterns that were added to the entity ruler.
        RETURNS (list): The original patterns, one dictionary per pattern.
        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
@ -175,7 +172,6 @@ class EntityRuler(object):
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
        patterns (list): The patterns to add.
        DOCS: https://spacy.io/api/entityruler#add_patterns
@ -192,7 +188,11 @@ class EntityRuler(object):
            for entry in patterns:
                label = entry["label"]
                if "id" in entry:
                    ent_label = label
                    label = self._create_label(label, entry["id"])
                    key = self.matcher._normalize_key(label)
                    self._ent_ids[key] = (ent_label, entry["id"])
                pattern = entry["pattern"]
                if isinstance(pattern, basestring_):
                    self.phrase_patterns[label].append(self.nlp(pattern))
@ -232,8 +232,8 @@ class EntityRuler(object):
        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        RETURNS (EntityRuler): The loaded entity ruler.
        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        cfg = srsly.msgpack_loads(patterns_bytes)
@ -254,7 +254,6 @@ class EntityRuler(object):
        """Serialize the entity ruler patterns to a bytestring.
        RETURNS (bytes): The serialized patterns.
        DOCS: https://spacy.io/api/entityruler#to_bytes
        """
@ -274,8 +273,8 @@ class EntityRuler(object):
        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        RETURNS (EntityRuler): The loaded entity ruler.
        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@ -202,6 +202,14 @@ All labels present in the match patterns.
 | ----------- | ----- | ------------------ |
 | **RETURNS** | tuple | The string labels. |
 ## EntityRuler.ent_ids {#ent_ids tag="property"}
 All entity ids present in the match patterns' `id` properties.
 | Name        | Type  | Description        |
 | ----------- | ----- | ------------------ |
 | **RETURNS** | tuple | The string ent_ids. |
 ## EntityRuler.patterns {#patterns tag="property"}
 Get all patterns that were added to the entity ruler.
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```
 ### Adding ids to patterns {#entityruler-ent-ids}
 The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
 ```python
 ### {executable="true"}
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
 nlp = English()
 ruler = EntityRuler(nlp)
 patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
 ruler.add_patterns(patterns)
 nlp.add_pipe(ruler)
 doc1 = nlp("Apple is opening its first big office in San Francisco.")
 print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
 doc2 = nlp("Apple is opening its first big office in San Fran.")
 print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
 ```
 If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. In the example above, this makes it easy to see that "San Francisco" and "San Fran" refer to the same entity.
 The entity ruler is designed to integrate with spaCy's existing statistical
 models and enhance the named entity recognizer. If it's added **before the
 `"ner"` component**, the entity recognizer will respect the existing entity