mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)
* Update entityruler.py * Making ent_id resolution 2x faster and adding docs * Fixing newlines in docstrings * Fixing newlines in docstrings
This commit is contained in:
parent
cc05d9dad6
commit
93640373c7
|
@ -44,8 +44,8 @@ class EntityRuler(object):
|
||||||
**cfg: Other config parameters. If pipeline component is loaded as part
|
**cfg: Other config parameters. If pipeline component is loaded as part
|
||||||
of a model pipeline, this will include all keyword arguments passed
|
of a model pipeline, this will include all keyword arguments passed
|
||||||
to `spacy.load`.
|
to `spacy.load`.
|
||||||
RETURNS (EntityRuler): The newly constructed object.
|
|
||||||
|
|
||||||
|
RETURNS (EntityRuler): The newly constructed object.
|
||||||
DOCS: https://spacy.io/api/entityruler#init
|
DOCS: https://spacy.io/api/entityruler#init
|
||||||
"""
|
"""
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
|
@ -64,6 +64,7 @@ class EntityRuler(object):
|
||||||
self.phrase_matcher_attr = None
|
self.phrase_matcher_attr = None
|
||||||
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
|
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
|
||||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||||
|
self._ent_ids = defaultdict(dict)
|
||||||
patterns = cfg.get("patterns")
|
patterns = cfg.get("patterns")
|
||||||
if patterns is not None:
|
if patterns is not None:
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
|
@ -82,8 +83,8 @@ class EntityRuler(object):
|
||||||
"""Find matches in document and add them as entities.
|
"""Find matches in document and add them as entities.
|
||||||
|
|
||||||
doc (Doc): The Doc object in the pipeline.
|
doc (Doc): The Doc object in the pipeline.
|
||||||
RETURNS (Doc): The Doc with added entities, if available.
|
|
||||||
|
|
||||||
|
RETURNS (Doc): The Doc with added entities, if available.
|
||||||
DOCS: https://spacy.io/api/entityruler#call
|
DOCS: https://spacy.io/api/entityruler#call
|
||||||
"""
|
"""
|
||||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||||
|
@ -100,10 +101,9 @@ class EntityRuler(object):
|
||||||
continue
|
continue
|
||||||
# check for end - 1 here because boundaries are inclusive
|
# check for end - 1 here because boundaries are inclusive
|
||||||
if start not in seen_tokens and end - 1 not in seen_tokens:
|
if start not in seen_tokens and end - 1 not in seen_tokens:
|
||||||
if self.ent_ids:
|
if match_id in self._ent_ids:
|
||||||
label_ = self.nlp.vocab.strings[match_id]
|
label, ent_id = self._ent_ids[match_id]
|
||||||
ent_label, ent_id = self._split_label(label_)
|
span = Span(doc, start, end, label=label)
|
||||||
span = Span(doc, start, end, label=ent_label)
|
|
||||||
if ent_id:
|
if ent_id:
|
||||||
for token in span:
|
for token in span:
|
||||||
token.ent_id_ = ent_id
|
token.ent_id_ = ent_id
|
||||||
|
@ -122,7 +122,6 @@ class EntityRuler(object):
|
||||||
"""All labels present in the match patterns.
|
"""All labels present in the match patterns.
|
||||||
|
|
||||||
RETURNS (set): The string labels.
|
RETURNS (set): The string labels.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#labels
|
DOCS: https://spacy.io/api/entityruler#labels
|
||||||
"""
|
"""
|
||||||
all_labels = set(self.token_patterns.keys())
|
all_labels = set(self.token_patterns.keys())
|
||||||
|
@ -131,11 +130,10 @@ class EntityRuler(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ent_ids(self):
|
def ent_ids(self):
|
||||||
"""All entity ids present in the match patterns meta dicts.
|
"""All entity ids present in the match patterns `id` properties.
|
||||||
|
|
||||||
RETURNS (set): The string entity ids.
|
RETURNS (set): The string entity ids.
|
||||||
|
DOCS: https://spacy.io/api/entityruler#ent_ids
|
||||||
DOCS: https://spacy.io/api/entityruler#labels
|
|
||||||
"""
|
"""
|
||||||
all_ent_ids = set()
|
all_ent_ids = set()
|
||||||
for l in self.labels:
|
for l in self.labels:
|
||||||
|
@ -149,7 +147,6 @@ class EntityRuler(object):
|
||||||
"""Get all patterns that were added to the entity ruler.
|
"""Get all patterns that were added to the entity ruler.
|
||||||
|
|
||||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#patterns
|
DOCS: https://spacy.io/api/entityruler#patterns
|
||||||
"""
|
"""
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
|
@ -175,7 +172,6 @@ class EntityRuler(object):
|
||||||
pattern (list of dicts) or a phrase pattern (string). For example:
|
pattern (list of dicts) or a phrase pattern (string). For example:
|
||||||
{'label': 'ORG', 'pattern': 'Apple'}
|
{'label': 'ORG', 'pattern': 'Apple'}
|
||||||
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
||||||
|
|
||||||
patterns (list): The patterns to add.
|
patterns (list): The patterns to add.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||||
|
@ -192,7 +188,11 @@ class EntityRuler(object):
|
||||||
for entry in patterns:
|
for entry in patterns:
|
||||||
label = entry["label"]
|
label = entry["label"]
|
||||||
if "id" in entry:
|
if "id" in entry:
|
||||||
|
ent_label = label
|
||||||
label = self._create_label(label, entry["id"])
|
label = self._create_label(label, entry["id"])
|
||||||
|
key = self.matcher._normalize_key(label)
|
||||||
|
self._ent_ids[key] = (ent_label, entry["id"])
|
||||||
|
|
||||||
pattern = entry["pattern"]
|
pattern = entry["pattern"]
|
||||||
if isinstance(pattern, basestring_):
|
if isinstance(pattern, basestring_):
|
||||||
self.phrase_patterns[label].append(self.nlp(pattern))
|
self.phrase_patterns[label].append(self.nlp(pattern))
|
||||||
|
@ -232,8 +232,8 @@ class EntityRuler(object):
|
||||||
|
|
||||||
patterns_bytes (bytes): The bytestring to load.
|
patterns_bytes (bytes): The bytestring to load.
|
||||||
**kwargs: Other config parameters, mostly for consistency.
|
**kwargs: Other config parameters, mostly for consistency.
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
|
||||||
|
|
||||||
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||||
"""
|
"""
|
||||||
cfg = srsly.msgpack_loads(patterns_bytes)
|
cfg = srsly.msgpack_loads(patterns_bytes)
|
||||||
|
@ -254,7 +254,6 @@ class EntityRuler(object):
|
||||||
"""Serialize the entity ruler patterns to a bytestring.
|
"""Serialize the entity ruler patterns to a bytestring.
|
||||||
|
|
||||||
RETURNS (bytes): The serialized patterns.
|
RETURNS (bytes): The serialized patterns.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#to_bytes
|
DOCS: https://spacy.io/api/entityruler#to_bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -274,8 +273,8 @@ class EntityRuler(object):
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to load.
|
path (unicode / Path): The JSONL file to load.
|
||||||
**kwargs: Other config parameters, mostly for consistency.
|
**kwargs: Other config parameters, mostly for consistency.
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
|
||||||
|
|
||||||
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||||
"""
|
"""
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
|
|
|
@ -202,6 +202,14 @@ All labels present in the match patterns.
|
||||||
| ----------- | ----- | ------------------ |
|
| ----------- | ----- | ------------------ |
|
||||||
| **RETURNS** | tuple | The string labels. |
|
| **RETURNS** | tuple | The string labels. |
|
||||||
|
|
||||||
|
## EntityRuler.ent_ids {#ent_ids tag="property"}
|
||||||
|
|
||||||
|
All entity ids present in the match patterns `id` properties.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | ------------------ |
|
||||||
|
| **RETURNS** | tuple | The string ent_ids. |
|
||||||
|
|
||||||
## EntityRuler.patterns {#patterns tag="property"}
|
## EntityRuler.patterns {#patterns tag="property"}
|
||||||
|
|
||||||
Get all patterns that were added to the entity ruler.
|
Get all patterns that were added to the entity ruler.
|
||||||
|
|
|
@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
print([(ent.text, ent.label_) for ent in doc.ents])
|
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Adding ids to patterns {#entityruler-ent-ids}
|
||||||
|
|
||||||
|
The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import EntityRuler
|
||||||
|
|
||||||
|
nlp = English()
|
||||||
|
ruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
||||||
|
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
|
||||||
|
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
|
||||||
|
ruler.add_patterns(patterns)
|
||||||
|
nlp.add_pipe(ruler)
|
||||||
|
|
||||||
|
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
||||||
|
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
||||||
|
|
||||||
|
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
||||||
|
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
|
||||||
|
```
|
||||||
|
|
||||||
|
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. So in the example above, it's easy to identify that "San Francisco" and "San Fran" refer to the same entity.
|
||||||
|
|
||||||
|
|
||||||
The entity ruler is designed to integrate with spaCy's existing statistical
|
The entity ruler is designed to integrate with spaCy's existing statistical
|
||||||
models and enhance the named entity recognizer. If it's added **before the
|
models and enhance the named entity recognizer. If it's added **before the
|
||||||
`"ner"` component**, the entity recognizer will respect the existing entity
|
`"ner"` component**, the entity recognizer will respect the existing entity
|
||||||
|
|
Loading…
Reference in New Issue
Block a user