mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Use nlp.pipe in EntityRuler for phrase patterns in add_patterns (#4931)
* Fix ent_ids and labels properties when id attribute used in patterns * use set for labels * sort end_ids for comparison in entity_ruler tests * fixing entity_ruler ent_ids test * add to set * Run make_doc optimistically if using phrase matcher patterns. * remove unused coveragerc I was testing with * format * Refactor EntityRuler.add_patterns to use nlp.pipe for phrase patterns. Improves speed substantially. * Removing old add_patterns function * Fixing spacing * Make sure token_patterns loaded as well, before generator was being emptied in from_disk
This commit is contained in:
parent
72c964bcf4
commit
f6ed07b85c
|
@ -8,7 +8,7 @@ from ..language import component
|
|||
from ..errors import Errors
|
||||
from ..compat import basestring_
|
||||
from ..util import ensure_path, to_disk, from_disk
|
||||
from ..tokens import Span
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
|
||||
DEFAULT_ENT_ID_SEP = "||"
|
||||
|
@ -162,6 +162,7 @@ class EntityRuler(object):
|
|||
@property
|
||||
def patterns(self):
|
||||
"""Get all patterns that were added to the entity ruler.
|
||||
|
||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#patterns
|
||||
|
@ -194,6 +195,7 @@ class EntityRuler(object):
|
|||
|
||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||
"""
|
||||
|
||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||
try:
|
||||
current_index = self.nlp.pipe_names.index(self.name)
|
||||
|
@ -203,7 +205,33 @@ class EntityRuler(object):
|
|||
except ValueError:
|
||||
subsequent_pipes = []
|
||||
with self.nlp.disable_pipes(subsequent_pipes):
|
||||
token_patterns = []
|
||||
phrase_pattern_labels = []
|
||||
phrase_pattern_texts = []
|
||||
phrase_pattern_ids = []
|
||||
|
||||
for entry in patterns:
|
||||
if isinstance(entry["pattern"], basestring_):
|
||||
phrase_pattern_labels.append(entry["label"])
|
||||
phrase_pattern_texts.append(entry["pattern"])
|
||||
phrase_pattern_ids.append(entry.get("id"))
|
||||
elif isinstance(entry["pattern"], list):
|
||||
token_patterns.append(entry)
|
||||
|
||||
phrase_patterns = []
|
||||
for label, pattern, ent_id in zip(
|
||||
phrase_pattern_labels,
|
||||
self.nlp.pipe(phrase_pattern_texts),
|
||||
phrase_pattern_ids
|
||||
):
|
||||
phrase_pattern = {
|
||||
"label": label, "pattern": pattern, "id": ent_id
|
||||
}
|
||||
if ent_id:
|
||||
phrase_pattern["id"] = ent_id
|
||||
phrase_patterns.append(phrase_pattern)
|
||||
|
||||
for entry in token_patterns + phrase_patterns:
|
||||
label = entry["label"]
|
||||
if "id" in entry:
|
||||
ent_label = label
|
||||
|
@ -212,8 +240,8 @@ class EntityRuler(object):
|
|||
self._ent_ids[key] = (ent_label, entry["id"])
|
||||
|
||||
pattern = entry["pattern"]
|
||||
if isinstance(pattern, basestring_):
|
||||
self.phrase_patterns[label].append(self.nlp(pattern))
|
||||
if isinstance(pattern, Doc):
|
||||
self.phrase_patterns[label].append(pattern)
|
||||
elif isinstance(pattern, list):
|
||||
self.token_patterns[label].append(pattern)
|
||||
else:
|
||||
|
@ -226,6 +254,8 @@ class EntityRuler(object):
|
|||
def _split_label(self, label):
|
||||
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
||||
|
||||
label (str): The value of label in a pattern entry
|
||||
|
||||
RETURNS (tuple): ent_label, ent_id
|
||||
"""
|
||||
if self.ent_id_sep in label:
|
||||
|
@ -239,6 +269,9 @@ class EntityRuler(object):
|
|||
def _create_label(self, label, ent_id):
|
||||
"""Join Entity label with ent_id if the pattern has an `id` attribute
|
||||
|
||||
label (str): The label to set for ent.label_
|
||||
ent_id (str): The label
|
||||
|
||||
RETURNS (str): The ent_label joined with configured `ent_id_sep`
|
||||
"""
|
||||
if isinstance(ent_id, basestring_):
|
||||
|
@ -250,6 +283,7 @@ class EntityRuler(object):
|
|||
|
||||
patterns_bytes (bytes): The bytestring to load.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||
|
@ -292,6 +326,7 @@ class EntityRuler(object):
|
|||
|
||||
path (unicode / Path): The JSONL file to load.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||
|
|
|
@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components will
|
|||
be restored and deserialized – including the entity ruler. This lets you ship
|
||||
powerful model packages with binary weights _and_ rules included!
|
||||
|
||||
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||
|
||||
When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
|
||||
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
|
||||
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
|
||||
extract matches based on the pattern's POS signature.
|
||||
|
||||
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
|
||||
|
||||
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
|
||||
|
||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
|
||||
|
||||
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
|
||||
|
||||
An easy workaround to make this function run faster is disabling the other language pipes
|
||||
while adding the phrase patterns.
|
||||
|
||||
```python
|
||||
entityruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
|
||||
|
||||
other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
|
||||
with nlp.disable_pipes(*disable_pipes):
|
||||
entityruler.add_patterns(patterns)
|
||||
```
|
||||
|
||||
## Combining models and rules {#models-rules}
|
||||
|
||||
You can combine statistical and rule-based components in a variety of ways.
|
||||
|
|
Loading…
Reference in New Issue
Block a user