Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)

* Update entityruler.py

* Making ent_id resolution 2x faster and adding docs

* Fixing newlines in docstrings

* Fixing newlines in docstrings
This commit is contained in:
Kabir Khan 2019-10-25 02:16:42 -07:00 committed by Ines Montani
parent cc05d9dad6
commit 93640373c7
3 changed files with 49 additions and 15 deletions

View File

@ -44,8 +44,8 @@ class EntityRuler(object):
**cfg: Other config parameters. If pipeline component is loaded as part **cfg: Other config parameters. If pipeline component is loaded as part
of a model pipeline, this will include all keyword arguments passed of a model pipeline, this will include all keyword arguments passed
to `spacy.load`. to `spacy.load`.
RETURNS (EntityRuler): The newly constructed object.
RETURNS (EntityRuler): The newly constructed object.
DOCS: https://spacy.io/api/entityruler#init DOCS: https://spacy.io/api/entityruler#init
""" """
self.nlp = nlp self.nlp = nlp
@ -64,6 +64,7 @@ class EntityRuler(object):
self.phrase_matcher_attr = None self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
self._ent_ids = defaultdict(dict)
patterns = cfg.get("patterns") patterns = cfg.get("patterns")
if patterns is not None: if patterns is not None:
self.add_patterns(patterns) self.add_patterns(patterns)
@ -82,8 +83,8 @@ class EntityRuler(object):
"""Find matches in document and add them as entities. """Find matches in document and add them as entities.
doc (Doc): The Doc object in the pipeline. doc (Doc): The Doc object in the pipeline.
RETURNS (Doc): The Doc with added entities, if available.
RETURNS (Doc): The Doc with added entities, if available.
DOCS: https://spacy.io/api/entityruler#call DOCS: https://spacy.io/api/entityruler#call
""" """
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
@ -100,10 +101,9 @@ class EntityRuler(object):
continue continue
# check for end - 1 here because boundaries are inclusive # check for end - 1 here because boundaries are inclusive
if start not in seen_tokens and end - 1 not in seen_tokens: if start not in seen_tokens and end - 1 not in seen_tokens:
if self.ent_ids: if match_id in self._ent_ids:
label_ = self.nlp.vocab.strings[match_id] label, ent_id = self._ent_ids[match_id]
ent_label, ent_id = self._split_label(label_) span = Span(doc, start, end, label=label)
span = Span(doc, start, end, label=ent_label)
if ent_id: if ent_id:
for token in span: for token in span:
token.ent_id_ = ent_id token.ent_id_ = ent_id
@ -122,7 +122,6 @@ class EntityRuler(object):
"""All labels present in the match patterns. """All labels present in the match patterns.
RETURNS (set): The string labels. RETURNS (set): The string labels.
DOCS: https://spacy.io/api/entityruler#labels DOCS: https://spacy.io/api/entityruler#labels
""" """
all_labels = set(self.token_patterns.keys()) all_labels = set(self.token_patterns.keys())
@ -131,11 +130,10 @@ class EntityRuler(object):
@property @property
def ent_ids(self): def ent_ids(self):
"""All entity ids present in the match patterns meta dicts. """All entity ids present in the match patterns `id` properties
RETURNS (set): The string entity ids. RETURNS (set): The string entity ids.
DOCS: https://spacy.io/api/entityruler#ent_ids
DOCS: https://spacy.io/api/entityruler#labels
""" """
all_ent_ids = set() all_ent_ids = set()
for l in self.labels: for l in self.labels:
@ -149,7 +147,6 @@ class EntityRuler(object):
"""Get all patterns that were added to the entity ruler. """Get all patterns that were added to the entity ruler.
RETURNS (list): The original patterns, one dictionary per pattern. RETURNS (list): The original patterns, one dictionary per pattern.
DOCS: https://spacy.io/api/entityruler#patterns DOCS: https://spacy.io/api/entityruler#patterns
""" """
all_patterns = [] all_patterns = []
@ -175,7 +172,6 @@ class EntityRuler(object):
pattern (list of dicts) or a phrase pattern (string). For example: pattern (list of dicts) or a phrase pattern (string). For example:
{'label': 'ORG', 'pattern': 'Apple'} {'label': 'ORG', 'pattern': 'Apple'}
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
patterns (list): The patterns to add. patterns (list): The patterns to add.
DOCS: https://spacy.io/api/entityruler#add_patterns DOCS: https://spacy.io/api/entityruler#add_patterns
@ -192,7 +188,11 @@ class EntityRuler(object):
for entry in patterns: for entry in patterns:
label = entry["label"] label = entry["label"]
if "id" in entry: if "id" in entry:
ent_label = label
label = self._create_label(label, entry["id"]) label = self._create_label(label, entry["id"])
key = self.matcher._normalize_key(label)
self._ent_ids[key] = (ent_label, entry["id"])
pattern = entry["pattern"] pattern = entry["pattern"]
if isinstance(pattern, basestring_): if isinstance(pattern, basestring_):
self.phrase_patterns[label].append(self.nlp(pattern)) self.phrase_patterns[label].append(self.nlp(pattern))
@ -232,8 +232,8 @@ class EntityRuler(object):
patterns_bytes (bytes): The bytestring to load. patterns_bytes (bytes): The bytestring to load.
**kwargs: Other config parameters, mostly for consistency. **kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_bytes DOCS: https://spacy.io/api/entityruler#from_bytes
""" """
cfg = srsly.msgpack_loads(patterns_bytes) cfg = srsly.msgpack_loads(patterns_bytes)
@ -254,7 +254,6 @@ class EntityRuler(object):
"""Serialize the entity ruler patterns to a bytestring. """Serialize the entity ruler patterns to a bytestring.
RETURNS (bytes): The serialized patterns. RETURNS (bytes): The serialized patterns.
DOCS: https://spacy.io/api/entityruler#to_bytes DOCS: https://spacy.io/api/entityruler#to_bytes
""" """
@ -274,8 +273,8 @@ class EntityRuler(object):
path (unicode / Path): The JSONL file to load. path (unicode / Path): The JSONL file to load.
**kwargs: Other config parameters, mostly for consistency. **kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_disk DOCS: https://spacy.io/api/entityruler#from_disk
""" """
path = ensure_path(path) path = ensure_path(path)

View File

@ -202,6 +202,14 @@ All labels present in the match patterns.
| ----------- | ----- | ------------------ | | ----------- | ----- | ------------------ |
| **RETURNS** | tuple | The string labels. | | **RETURNS** | tuple | The string labels. |
## EntityRuler.ent_ids {#ent_ids tag="property"}
All entity ids present in the match patterns' `id` properties.
| Name | Type | Description |
| ----------- | ----- | ------------------ |
| **RETURNS** | tuple | The string ent_ids. |
## EntityRuler.patterns {#patterns tag="property"} ## EntityRuler.patterns {#patterns tag="property"}
Get all patterns that were added to the entity ruler. Get all patterns that were added to the entity ruler.

View File

@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```
### Adding ids to patterns {#entityruler-ent-ids}
The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
```python
### {executable="true"}
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. So in the example above, it's easy to identify that "San Francisco" and "San Fran" both refer to the same entity.
The entity ruler is designed to integrate with spaCy's existing statistical The entity ruler is designed to integrate with spaCy's existing statistical
models and enhance the named entity recognizer. If it's added **before the models and enhance the named entity recognizer. If it's added **before the
`"ner"` component**, the entity recognizer will respect the existing entity `"ner"` component**, the entity recognizer will respect the existing entity