Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)

* Update entityruler.py * Making ent_id resolution 2x faster and adding docs * Fixing newlines in docstrings * Fixing newlines in docstrings
2025-07-25 23:49:46 +03:00 · 2019-10-25 02:16:42 -07:00 · 2019-10-25 02:16:42 -07:00 · 93640373c7
commit 93640373c7
parent cc05d9dad6
3 changed files with 49 additions and 15 deletions
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -44,8 +44,8 @@ class EntityRuler(object):
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
-        RETURNS (EntityRuler): The newly constructed object.

+        RETURNS (EntityRuler): The newly constructed object.
        DOCS: https://spacy.io/api/entityruler#init
        """
        self.nlp = nlp
@ -64,6 +64,7 @@ class EntityRuler(object):
            self.phrase_matcher_attr = None
            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
        self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
+        self._ent_ids = defaultdict(dict)
        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)
@ -82,8 +83,8 @@ class EntityRuler(object):
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.

+        RETURNS (Doc): The Doc with added entities, if available.
        DOCS: https://spacy.io/api/entityruler#call
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
@ -100,10 +101,9 @@ class EntityRuler(object):
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
-                if self.ent_ids:
-                    label_ = self.nlp.vocab.strings[match_id]
-                    ent_label, ent_id = self._split_label(label_)
-                    span = Span(doc, start, end, label=ent_label)
+                if match_id in self._ent_ids:
+                    label, ent_id = self._ent_ids[match_id]
+                    span = Span(doc, start, end, label=label)
                    if ent_id:
                        for token in span:
                            token.ent_id_ = ent_id
@ -122,7 +122,6 @@ class EntityRuler(object):
        """All labels present in the match patterns.

        RETURNS (set): The string labels.
-
        DOCS: https://spacy.io/api/entityruler#labels
        """
        all_labels = set(self.token_patterns.keys())
@ -131,11 +130,10 @@ class EntityRuler(object):

    @property
    def ent_ids(self):
-        """All entity ids present in the match patterns meta dicts.
+        """All entity ids present in the match patterns `id` properties

        RETURNS (set): The string entity ids.
-
-        DOCS: https://spacy.io/api/entityruler#labels
+        DOCS: https://spacy.io/api/entityruler#ent_ids
        """
        all_ent_ids = set()
        for l in self.labels:
@ -149,7 +147,6 @@ class EntityRuler(object):
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.
-
        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
@ -175,7 +172,6 @@ class EntityRuler(object):
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/entityruler#add_patterns
@ -192,7 +188,11 @@ class EntityRuler(object):
            for entry in patterns:
                label = entry["label"]
                if "id" in entry:
+                    ent_label = label
                    label = self._create_label(label, entry["id"])
+                    key = self.matcher._normalize_key(label)
+                    self._ent_ids[key] = (ent_label, entry["id"])
+
                pattern = entry["pattern"]
                if isinstance(pattern, basestring_):
                    self.phrase_patterns[label].append(self.nlp(pattern))
@ -232,8 +232,8 @@ class EntityRuler(object):

        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config paramters, mostly for consistency.
-        RETURNS (EntityRuler): The loaded entity ruler.

+        RETURNS (EntityRuler): The loaded entity ruler.
        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        cfg = srsly.msgpack_loads(patterns_bytes)
@ -254,7 +254,6 @@ class EntityRuler(object):
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.
-
        DOCS: https://spacy.io/api/entityruler#to_bytes
        """

@ -274,8 +273,8 @@ class EntityRuler(object):

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
-        RETURNS (EntityRuler): The loaded entity ruler.

+        RETURNS (EntityRuler): The loaded entity ruler.
        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@ -202,6 +202,14 @@ All labels present in the match patterns.
 | ----------- | ----- | ------------------ |
 | **RETURNS** | tuple | The string labels. |

+## EntityRuler.ent_ids {#ent_ids tag="property"}
+
+All entity ids present in the `id` properties of the match patterns.
+
+| Name        | Type  | Description            |
+| ----------- | ----- | ---------------------- |
+| **RETURNS** | tuple | The string entity ids. |
+
 ## EntityRuler.patterns {#patterns tag="property"}

 Get all patterns that were added to the entity ruler.
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
 print([(ent.text, ent.label_) for ent in doc.ents])
 ```

+### Adding ids to patterns {#entityruler-ent-ids}
+
+The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
+
+```python
+### {executable="true"}
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+nlp = English()
+ruler = EntityRuler(nlp)
+patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
+            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
+            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
+ruler.add_patterns(patterns)
+nlp.add_pipe(ruler)
+
+doc1 = nlp("Apple is opening its first big office in San Francisco.")
+print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+
+doc2 = nlp("Apple is opening its first big office in San Fran.")
+print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+```
+
+If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. So in the example above, it's easy to identify that "San Francisco" and "San Fran" both refer to the same entity.
+
+
 The entity ruler is designed to integrate with spaCy's existing statistical
 models and enhance the named entity recognizer. If it's added **before the
 `"ner"` component**, the entity recognizer will respect the existing entity