Use nlp.pipe in EntityRuler for phrase patterns in add_patterns (#4931)

* Fix ent_ids and labels properties when id attribute used in patterns

* use set for labels

* sort ent_ids for comparison in entity_ruler tests

* fixing entity_ruler ent_ids test

* add to set

* Run make_doc optimistically if using phrase matcher patterns.

* remove unused coveragerc I was testing with

* format

* Refactor EntityRuler.add_patterns to use nlp.pipe for phrase patterns. Improves speed substantially.

* Removing old add_patterns function

* Fixing spacing

* Make sure token_patterns are loaded as well; previously the generator was being emptied in from_disk
This commit is contained in:
Kabir Khan 2020-02-16 09:17:47 -08:00 committed by GitHub
parent 72c964bcf4
commit f6ed07b85c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 65 additions and 3 deletions

View File

@ -8,7 +8,7 @@ from ..language import component
from ..errors import Errors from ..errors import Errors
from ..compat import basestring_ from ..compat import basestring_
from ..util import ensure_path, to_disk, from_disk from ..util import ensure_path, to_disk, from_disk
from ..tokens import Span from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
DEFAULT_ENT_ID_SEP = "||" DEFAULT_ENT_ID_SEP = "||"
@ -162,6 +162,7 @@ class EntityRuler(object):
@property @property
def patterns(self): def patterns(self):
"""Get all patterns that were added to the entity ruler. """Get all patterns that were added to the entity ruler.
RETURNS (list): The original patterns, one dictionary per pattern. RETURNS (list): The original patterns, one dictionary per pattern.
DOCS: https://spacy.io/api/entityruler#patterns DOCS: https://spacy.io/api/entityruler#patterns
@ -194,6 +195,7 @@ class EntityRuler(object):
DOCS: https://spacy.io/api/entityruler#add_patterns DOCS: https://spacy.io/api/entityruler#add_patterns
""" """
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try: try:
current_index = self.nlp.pipe_names.index(self.name) current_index = self.nlp.pipe_names.index(self.name)
@ -203,7 +205,33 @@ class EntityRuler(object):
except ValueError: except ValueError:
subsequent_pipes = [] subsequent_pipes = []
with self.nlp.disable_pipes(subsequent_pipes): with self.nlp.disable_pipes(subsequent_pipes):
token_patterns = []
phrase_pattern_labels = []
phrase_pattern_texts = []
phrase_pattern_ids = []
for entry in patterns: for entry in patterns:
if isinstance(entry["pattern"], basestring_):
phrase_pattern_labels.append(entry["label"])
phrase_pattern_texts.append(entry["pattern"])
phrase_pattern_ids.append(entry.get("id"))
elif isinstance(entry["pattern"], list):
token_patterns.append(entry)
phrase_patterns = []
for label, pattern, ent_id in zip(
phrase_pattern_labels,
self.nlp.pipe(phrase_pattern_texts),
phrase_pattern_ids
):
phrase_pattern = {
"label": label, "pattern": pattern, "id": ent_id
}
if ent_id:
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)
for entry in token_patterns + phrase_patterns:
label = entry["label"] label = entry["label"]
if "id" in entry: if "id" in entry:
ent_label = label ent_label = label
@ -212,8 +240,8 @@ class EntityRuler(object):
self._ent_ids[key] = (ent_label, entry["id"]) self._ent_ids[key] = (ent_label, entry["id"])
pattern = entry["pattern"] pattern = entry["pattern"]
if isinstance(pattern, basestring_): if isinstance(pattern, Doc):
self.phrase_patterns[label].append(self.nlp(pattern)) self.phrase_patterns[label].append(pattern)
elif isinstance(pattern, list): elif isinstance(pattern, list):
self.token_patterns[label].append(pattern) self.token_patterns[label].append(pattern)
else: else:
@ -226,6 +254,8 @@ class EntityRuler(object):
def _split_label(self, label): def _split_label(self, label):
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
label (str): The value of label in a pattern entry
RETURNS (tuple): ent_label, ent_id RETURNS (tuple): ent_label, ent_id
""" """
if self.ent_id_sep in label: if self.ent_id_sep in label:
@ -239,6 +269,9 @@ class EntityRuler(object):
def _create_label(self, label, ent_id): def _create_label(self, label, ent_id):
"""Join Entity label with ent_id if the pattern has an `id` attribute """Join Entity label with ent_id if the pattern has an `id` attribute
label (str): The label to set for ent.label_
ent_id (str): The id taken from the pattern's `id` attribute
RETURNS (str): The ent_label joined with configured `ent_id_sep` RETURNS (str): The ent_label joined with configured `ent_id_sep`
""" """
if isinstance(ent_id, basestring_): if isinstance(ent_id, basestring_):
@ -250,6 +283,7 @@ class EntityRuler(object):
patterns_bytes (bytes): The bytestring to load. patterns_bytes (bytes): The bytestring to load.
**kwargs: Other config parameters, mostly for consistency. **kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler. RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_bytes DOCS: https://spacy.io/api/entityruler#from_bytes
@ -292,6 +326,7 @@ class EntityRuler(object):
path (unicode / Path): The JSONL file to load. path (unicode / Path): The JSONL file to load.
**kwargs: Other config parameters, mostly for consistency. **kwargs: Other config parameters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler. RETURNS (EntityRuler): The loaded entity ruler.
DOCS: https://spacy.io/api/entityruler#from_disk DOCS: https://spacy.io/api/entityruler#from_disk

View File

@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components will
be restored and deserialized including the entity ruler. This lets you ship be restored and deserialized including the entity ruler. This lets you ship
powerful model packages with binary weights _and_ rules included! powerful model packages with binary weights _and_ rules included!
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
When using a large number of **phrase patterns** (roughly > 10000), it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
the EntityRuler calls the `nlp` object to construct a `Doc` object. This is done so that you can
add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and
extract matches based on the pattern's POS signature.
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time with a large number of phrase patterns.
As of spaCy 2.2.4, the `add_patterns` function has been refactored to use `nlp.pipe` on all phrase patterns, resulting in a speedup of about 10x with 5,000 phrase patterns and about 20x with 100,000 phrase patterns.
Even with this speedup (but especially if you're using an older version), the `add_patterns` function can still take a long time.
An easy workaround to make this function run faster is to disable the other language pipes
while adding the phrase patterns.
```python
entityruler = EntityRuler(nlp)
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]

# Disable every pipe except the tagger while the patterns are added, so the
# phrase patterns are not run through components that are not needed here.
other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
with nlp.disable_pipes(*other_pipes):
    entityruler.add_patterns(patterns)
```
## Combining models and rules {#models-rules} ## Combining models and rules {#models-rules}
You can combine statistical and rule-based components in a variety of ways. You can combine statistical and rule-based components in a variety of ways.