mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Make entity_ruler ent_id resolution 2x faster and add docs for… (#4513)
* Update entityruler.py * Making ent_id resolution 2x faster and adding docs * Fixing newlines in docstrings * Fixing newlines in docstrings
This commit is contained in:
parent
cc05d9dad6
commit
93640373c7
|
@ -44,8 +44,8 @@ class EntityRuler(object):
|
|||
**cfg: Other config parameters. If pipeline component is loaded as part
|
||||
of a model pipeline, this will include all keyword arguments passed
|
||||
to `spacy.load`.
|
||||
RETURNS (EntityRuler): The newly constructed object.
|
||||
|
||||
RETURNS (EntityRuler): The newly constructed object.
|
||||
DOCS: https://spacy.io/api/entityruler#init
|
||||
"""
|
||||
self.nlp = nlp
|
||||
|
@ -64,6 +64,7 @@ class EntityRuler(object):
|
|||
self.phrase_matcher_attr = None
|
||||
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
|
||||
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
|
||||
self._ent_ids = defaultdict(dict)
|
||||
patterns = cfg.get("patterns")
|
||||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
|
@ -82,8 +83,8 @@ class EntityRuler(object):
|
|||
"""Find matches in document and add them as entities.
|
||||
|
||||
doc (Doc): The Doc object in the pipeline.
|
||||
RETURNS (Doc): The Doc with added entities, if available.
|
||||
|
||||
RETURNS (Doc): The Doc with added entities, if available.
|
||||
DOCS: https://spacy.io/api/entityruler#call
|
||||
"""
|
||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||
|
@ -100,10 +101,9 @@ class EntityRuler(object):
|
|||
continue
|
||||
# check for end - 1 here because boundaries are inclusive
|
||||
if start not in seen_tokens and end - 1 not in seen_tokens:
|
||||
if self.ent_ids:
|
||||
label_ = self.nlp.vocab.strings[match_id]
|
||||
ent_label, ent_id = self._split_label(label_)
|
||||
span = Span(doc, start, end, label=ent_label)
|
||||
if match_id in self._ent_ids:
|
||||
label, ent_id = self._ent_ids[match_id]
|
||||
span = Span(doc, start, end, label=label)
|
||||
if ent_id:
|
||||
for token in span:
|
||||
token.ent_id_ = ent_id
|
||||
|
@ -122,7 +122,6 @@ class EntityRuler(object):
|
|||
"""All labels present in the match patterns.
|
||||
|
||||
RETURNS (set): The string labels.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#labels
|
||||
"""
|
||||
all_labels = set(self.token_patterns.keys())
|
||||
|
@ -131,11 +130,10 @@ class EntityRuler(object):
|
|||
|
||||
@property
|
||||
def ent_ids(self):
|
||||
"""All entity ids present in the match patterns meta dicts.
|
||||
"""All entity ids present in the match patterns `id` properties
|
||||
|
||||
RETURNS (set): The string entity ids.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#labels
|
||||
DOCS: https://spacy.io/api/entityruler#ent_ids
|
||||
"""
|
||||
all_ent_ids = set()
|
||||
for l in self.labels:
|
||||
|
@ -149,7 +147,6 @@ class EntityRuler(object):
|
|||
"""Get all patterns that were added to the entity ruler.
|
||||
|
||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#patterns
|
||||
"""
|
||||
all_patterns = []
|
||||
|
@ -175,7 +172,6 @@ class EntityRuler(object):
|
|||
pattern (list of dicts) or a phrase pattern (string). For example:
|
||||
{'label': 'ORG', 'pattern': 'Apple'}
|
||||
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
||||
|
||||
patterns (list): The patterns to add.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||
|
@ -192,7 +188,11 @@ class EntityRuler(object):
|
|||
for entry in patterns:
|
||||
label = entry["label"]
|
||||
if "id" in entry:
|
||||
ent_label = label
|
||||
label = self._create_label(label, entry["id"])
|
||||
key = self.matcher._normalize_key(label)
|
||||
self._ent_ids[key] = (ent_label, entry["id"])
|
||||
|
||||
pattern = entry["pattern"]
|
||||
if isinstance(pattern, basestring_):
|
||||
self.phrase_patterns[label].append(self.nlp(pattern))
|
||||
|
@ -232,8 +232,8 @@ class EntityRuler(object):
|
|||
|
||||
patterns_bytes (bytes): The bytestring to load.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||
"""
|
||||
cfg = srsly.msgpack_loads(patterns_bytes)
|
||||
|
@ -254,7 +254,6 @@ class EntityRuler(object):
|
|||
"""Serialize the entity ruler patterns to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized patterns.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_bytes
|
||||
"""
|
||||
|
||||
|
@ -274,8 +273,8 @@ class EntityRuler(object):
|
|||
|
||||
path (unicode / Path): The JSONL file to load.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
|
|
|
@ -202,6 +202,14 @@ All labels present in the match patterns.
|
|||
| ----------- | ----- | ------------------ |
|
||||
| **RETURNS** | tuple | The string labels. |
|
||||
|
||||
## EntityRuler.ent_ids {#labels tag="property"}
|
||||
|
||||
All entity ids present in the match patterns `id` properties
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------ |
|
||||
| **RETURNS** | tuple | The string ent_ids. |
|
||||
|
||||
## EntityRuler.patterns {#patterns tag="property"}
|
||||
|
||||
Get all patterns that were added to the entity ruler.
|
||||
|
|
|
@ -986,6 +986,33 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
|
|||
print([(ent.text, ent.label_) for ent in doc.ents])
|
||||
```
|
||||
|
||||
### Adding ids to patterns {#entityruler-ent-ids}
|
||||
|
||||
The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each pattern. Using the `id` attribute allows multiple patterns to be associated with the same entity.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
|
||||
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
|
||||
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc1 = nlp("Apple is opening its first big office in San Francisco.")
|
||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
|
||||
|
||||
doc2 = nlp("Apple is opening its first big office in San Fran.")
|
||||
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
|
||||
```
|
||||
|
||||
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) patterns, the `ent_id_` property of the matched entity is set to the `id` given in the patterns. So in the example above it's easy to identify that "San Francisco" and "San Fran" are both the same entity.
|
||||
|
||||
|
||||
The entity ruler is designed to integrate with spaCy's existing statistical
|
||||
models and enhance the named entity recognizer. If it's added **before the
|
||||
`"ner"` component**, the entity recognizer will respect the existing entity
|
||||
|
|
Loading…
Reference in New Issue
Block a user