Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)

* Update entityruler.py

* Making ent_id resolution 2x faster and adding docs

* Fixing newlines in docstrings

* Fixing newlines in docstrings
This commit is contained in:
Kabir Khan 2019-10-25 02:16:42 -07:00 committed by Ines Montani
parent cc05d9dad6
commit 93640373c7
3 changed files with 49 additions and 15 deletions

View File

@ -44,8 +44,8 @@ class EntityRuler(object):
**cfg: Other config parameters. If pipeline component is loaded as part
of a model pipeline, this will include all keyword arguments passed
to `spacy.load`.
RETURNS (EntityRuler): The newly constructed object.
RETURNS (EntityRuler): The newly constructed object.
DOCS: https://spacy.io/api/entityruler#init
"""
self.nlp = nlp
@ -64,6 +64,7 @@ class EntityRuler(object):
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
self._ent_ids = defaultdict(dict)
patterns = cfg.get("patterns")
if patterns is not None:
self.add_patterns(patterns)
@ -82,8 +83,8 @@ class EntityRuler(object):
"""Find matches in document and add them as entities.
doc (Doc): The Doc object in the pipeline.
RETURNS (Doc): The Doc with added entities, if available.
RETURNS (Doc): The Doc with added entities, if available.
DOCS: https://spacy.io/api/entityruler#call
"""
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
@ -100,10 +101,9 @@ class EntityRuler(object):
continue
# check for end - 1 here because boundaries are inclusive
if start not in seen_tokens and end - 1 not in seen_tokens:
if self.ent_ids:
label_ = self.nlp.vocab.strings[match_id]
ent_label, ent_id = self._split_label(label_)
span = Span(doc, start, end, label=ent_label)
if match_id in self._ent_ids:
label, ent_id = self._ent_ids[match_id]
span = Span(doc, start, end, label=label)
if ent_id:
for token in span:
token.ent_id_ = ent_id
@ -122,7 +122,6 @@ class EntityRuler(object):
"""All labels present in the match patterns.
RETURNS (set): The string labels.
DOCS: https://spacy.io/api/entityruler#labels
"""
all_labels = set(self.token_patterns.keys())
@ -131,11 +130,10 @@ class EntityRuler(object):
@property
def ent_ids(self):
"""All entity ids present in the match patterns meta dicts.
"""All entity ids present in the match patterns `id` properties.
RETURNS (set): The string entity ids.
DOCS: https://spacy.io/api/entityruler#labels
DOCS: https://spacy.io/api/entityruler#ent_ids
"""
all_ent_ids = set()
for l in self.labels:
@ -149,7 +147,6 @@ class EntityRuler(object):
"""Get all patterns that were added to the entity ruler.
RETURNS (list): The original patterns, one dictionary per pattern.
DOCS: https://spacy.io/api/entityruler#patterns
"""
all_patterns = []
@ -175,7 +172,6 @@ class EntityRuler(object):
pattern (list of dicts) or a phrase pattern (string). For example:
{'label': 'ORG', 'pattern': 'Apple'}
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
patterns (list): The patterns to add.
DOCS: https://spacy.io/api/entityruler#add_patterns
@ -192,7 +188,11 @@ class EntityRuler(object):
for entry in patterns:
label = entry["label"]
if "id" in entry:
ent_label = label
label = self._create_label(label, entry["id"])
key = self.matcher._normalize_key(label)
self._ent_ids[key] = (ent_label, entry["id"])
pattern = entry["pattern"]
if isinstance(pattern, basestring_):
self.phrase_patterns[label].append(self.nlp(pattern))
@ -232,8 +232,8 @@ class EntityRuler(object):
patterns_bytes (bytes): The bytestring to load.
**kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_bytes
"""
cfg = srsly.msgpack_loads(patterns_bytes)
@ -254,7 +254,6 @@ class EntityRuler(object):
"""Serialize the entity ruler patterns to a bytestring.
RETURNS (bytes): The serialized patterns.
DOCS: https://spacy.io/api/entityruler#to_bytes
"""
@ -274,8 +273,8 @@ class EntityRuler(object):
path (unicode / Path): The JSONL file to load.
**kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_disk
"""
path = ensure_path(path)

View File

@ -202,6 +202,14 @@ All labels present in the match patterns.
| ----------- | ----- | ------------------ |
| **RETURNS** | tuple | The string labels. |
## EntityRuler.ent_ids {#ent_ids tag="property"}
All entity ids present in the match patterns `id` properties.
| Name | Type | Description |
| ----------- | ----- | ------------------ |
| **RETURNS** | tuple | The string ent_ids. |
## EntityRuler.patterns {#patterns tag="property"}
Get all patterns that were added to the entity ruler.

View File

@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
### Adding ids to patterns {#entityruler-ent-ids}
The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
```python
### {executable="true"}
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. So in the example above, it's easy to identify that "San Francisco" and "San Fran" refer to the same entity.
The entity ruler is designed to integrate with spaCy's existing statistical
models and enhance the named entity recognizer. If it's added **before the
`"ner"` component**, the entity recognizer will respect the existing entity