mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Use nlp.pipe in EntityRuler for phrase patterns in add_patterns (#4931)
* Fix ent_ids and labels properties when id attribute used in patterns * use set for labels * sort end_ids for comparison in entity_ruler tests * fixing entity_ruler ent_ids test * add to set * Run make_doc optimistically if using phrase matcher patterns. * remove unused coveragerc I was testing with * format * Refactor EntityRuler.add_patterns to use nlp.pipe for phrase patterns. Improves speed substantially. * Removing old add_patterns function * Fixing spacing * Make sure token_patterns loaded as well, before generator was being emptied in from_disk
This commit is contained in:
parent
72c964bcf4
commit
f6ed07b85c
|
@ -8,7 +8,7 @@ from ..language import component
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..compat import basestring_
|
from ..compat import basestring_
|
||||||
from ..util import ensure_path, to_disk, from_disk
|
from ..util import ensure_path, to_disk, from_disk
|
||||||
from ..tokens import Span
|
from ..tokens import Doc, Span
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
|
|
||||||
DEFAULT_ENT_ID_SEP = "||"
|
DEFAULT_ENT_ID_SEP = "||"
|
||||||
|
@ -162,6 +162,7 @@ class EntityRuler(object):
|
||||||
@property
|
@property
|
||||||
def patterns(self):
|
def patterns(self):
|
||||||
"""Get all patterns that were added to the entity ruler.
|
"""Get all patterns that were added to the entity ruler.
|
||||||
|
|
||||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#patterns
|
DOCS: https://spacy.io/api/entityruler#patterns
|
||||||
|
@ -194,6 +195,7 @@ class EntityRuler(object):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#add_patterns
|
DOCS: https://spacy.io/api/entityruler#add_patterns
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
|
||||||
try:
|
try:
|
||||||
current_index = self.nlp.pipe_names.index(self.name)
|
current_index = self.nlp.pipe_names.index(self.name)
|
||||||
|
@ -203,7 +205,33 @@ class EntityRuler(object):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
subsequent_pipes = []
|
subsequent_pipes = []
|
||||||
with self.nlp.disable_pipes(subsequent_pipes):
|
with self.nlp.disable_pipes(subsequent_pipes):
|
||||||
|
token_patterns = []
|
||||||
|
phrase_pattern_labels = []
|
||||||
|
phrase_pattern_texts = []
|
||||||
|
phrase_pattern_ids = []
|
||||||
|
|
||||||
for entry in patterns:
|
for entry in patterns:
|
||||||
|
if isinstance(entry["pattern"], basestring_):
|
||||||
|
phrase_pattern_labels.append(entry["label"])
|
||||||
|
phrase_pattern_texts.append(entry["pattern"])
|
||||||
|
phrase_pattern_ids.append(entry.get("id"))
|
||||||
|
elif isinstance(entry["pattern"], list):
|
||||||
|
token_patterns.append(entry)
|
||||||
|
|
||||||
|
phrase_patterns = []
|
||||||
|
for label, pattern, ent_id in zip(
|
||||||
|
phrase_pattern_labels,
|
||||||
|
self.nlp.pipe(phrase_pattern_texts),
|
||||||
|
phrase_pattern_ids
|
||||||
|
):
|
||||||
|
phrase_pattern = {
|
||||||
|
"label": label, "pattern": pattern, "id": ent_id
|
||||||
|
}
|
||||||
|
if ent_id:
|
||||||
|
phrase_pattern["id"] = ent_id
|
||||||
|
phrase_patterns.append(phrase_pattern)
|
||||||
|
|
||||||
|
for entry in token_patterns + phrase_patterns:
|
||||||
label = entry["label"]
|
label = entry["label"]
|
||||||
if "id" in entry:
|
if "id" in entry:
|
||||||
ent_label = label
|
ent_label = label
|
||||||
|
@ -212,8 +240,8 @@ class EntityRuler(object):
|
||||||
self._ent_ids[key] = (ent_label, entry["id"])
|
self._ent_ids[key] = (ent_label, entry["id"])
|
||||||
|
|
||||||
pattern = entry["pattern"]
|
pattern = entry["pattern"]
|
||||||
if isinstance(pattern, basestring_):
|
if isinstance(pattern, Doc):
|
||||||
self.phrase_patterns[label].append(self.nlp(pattern))
|
self.phrase_patterns[label].append(pattern)
|
||||||
elif isinstance(pattern, list):
|
elif isinstance(pattern, list):
|
||||||
self.token_patterns[label].append(pattern)
|
self.token_patterns[label].append(pattern)
|
||||||
else:
|
else:
|
||||||
|
@ -226,6 +254,8 @@ class EntityRuler(object):
|
||||||
def _split_label(self, label):
|
def _split_label(self, label):
|
||||||
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
|
||||||
|
|
||||||
|
label (str): The value of label in a pattern entry
|
||||||
|
|
||||||
RETURNS (tuple): ent_label, ent_id
|
RETURNS (tuple): ent_label, ent_id
|
||||||
"""
|
"""
|
||||||
if self.ent_id_sep in label:
|
if self.ent_id_sep in label:
|
||||||
|
@ -239,6 +269,9 @@ class EntityRuler(object):
|
||||||
def _create_label(self, label, ent_id):
|
def _create_label(self, label, ent_id):
|
||||||
"""Join Entity label with ent_id if the pattern has an `id` attribute
|
"""Join Entity label with ent_id if the pattern has an `id` attribute
|
||||||
|
|
||||||
|
label (str): The label to set for ent.label_
|
||||||
|
ent_id (str): The label
|
||||||
|
|
||||||
RETURNS (str): The ent_label joined with configured `ent_id_sep`
|
RETURNS (str): The ent_label joined with configured `ent_id_sep`
|
||||||
"""
|
"""
|
||||||
if isinstance(ent_id, basestring_):
|
if isinstance(ent_id, basestring_):
|
||||||
|
@ -250,6 +283,7 @@ class EntityRuler(object):
|
||||||
|
|
||||||
patterns_bytes (bytes): The bytestring to load.
|
patterns_bytes (bytes): The bytestring to load.
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#from_bytes
|
DOCS: https://spacy.io/api/entityruler#from_bytes
|
||||||
|
@ -292,6 +326,7 @@ class EntityRuler(object):
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to load.
|
path (unicode / Path): The JSONL file to load.
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/entityruler#from_disk
|
DOCS: https://spacy.io/api/entityruler#from_disk
|
||||||
|
|
|
@ -1096,6 +1096,33 @@ with the patterns. When you load the model back in, all pipeline components will
|
||||||
be restored and deserialized – including the entity ruler. This lets you ship
|
be restored and deserialized – including the entity ruler. This lets you ship
|
||||||
powerful model packages with binary weights _and_ rules included!
|
powerful model packages with binary weights _and_ rules included!
|
||||||
|
|
||||||
|
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||||
|
|
||||||
|
When using a large number of **phrase patterns** (roughly > 10000), it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
|
||||||
|
the EntityRuler calls the nlp object to construct a `Doc` object. This is necessary in case you try
|
||||||
|
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
|
||||||
|
extract matches based on the pattern's POS signature.
|
||||||
|
|
||||||
|
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
|
||||||
|
|
||||||
|
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time with large numbers of phrase patterns.
|
||||||
|
|
||||||
|
As of spaCy 2.2.4, the `add_patterns` function has been refactored to use `nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with 5,000-100,000 phrase patterns respectively.
|
||||||
|
|
||||||
|
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
|
||||||
|
|
||||||
|
An easy workaround to make this function run faster is disabling the other language pipes
|
||||||
|
while adding the phrase patterns.
|
||||||
|
|
||||||
|
```python
|
||||||
|
entityruler = EntityRuler(nlp)
|
||||||
|
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(100000)]
|
||||||
|
|
||||||
|
other_pipes = [p for p in nlp.pipe_names if p != "tagger"]
|
||||||
|
with nlp.disable_pipes(*other_pipes):
|
||||||
|
entityruler.add_patterns(patterns)
|
||||||
|
```
|
||||||
|
|
||||||
## Combining models and rules {#models-rules}
|
## Combining models and rules {#models-rules}
|
||||||
|
|
||||||
You can combine statistical and rule-based components in a variety of ways.
|
You can combine statistical and rule-based components in a variety of ways.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user