Replace EntityRuler with SpanRuler implementation (#11320)

* Replace EntityRuler with SpanRuler implementation

Remove `EntityRuler` and rename the `SpanRuler`-based
`future_entity_ruler` to `entity_ruler`.

Main changes:

* It is no longer possible to load patterns on init as with
`EntityRuler(patterns=)`.
* The older serialization formats (`patterns.jsonl`) are no longer
supported and the related tests are removed.
* The config settings are only stored in the config, not in the
serialized component (in particular the `phrase_matcher_attr` and
overwrite settings).
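
As a quick illustration of the new usage (a minimal sketch, not part of this
commit, assuming a blank English pipeline), patterns are now always added
after the component is created:

```python
import spacy

nlp = spacy.blank("en")
# the "entity_ruler" factory is now backed by the SpanRuler implementation
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
# patterns can no longer be passed on init; add them explicitly
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])

doc = nlp("apple announced new products")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('apple', 'ORG')]
```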

* Add migration guide to EntityRuler API docs

* docs update

* Minor edit

Co-authored-by: svlandeg <svlandeg@github.com>
Adriane Boyd 2022-10-24 09:11:35 +02:00 committed by GitHub
parent a4bd890f32
commit cae4589f5a
15 changed files with 233 additions and 1072 deletions

View File

@@ -460,13 +460,13 @@ class Errors(metaclass=ErrorsWithCodes):
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E153 = ("The value type {vtype} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
-            "EntityRuler or AttributeRuler for more details.")
+            "SpanRuler or AttributeRuler for more details.")
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option `validate=True` with the Matcher, "
-            "PhraseMatcher, or EntityRuler for more details.")
+            "PhraseMatcher, or SpanRuler for more details.")
     E155 = ("The pipeline needs to include a {pipe} in order to use "
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@@ -917,8 +917,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
-             "exist.")
     E1024 = ("A pattern with {attr_type} '{label}' is not present in "
              "'{component}' patterns.")
     E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "

View File

@@ -3,7 +3,6 @@ from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
-from .entity_ruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .pipe import Pipe
@@ -23,7 +22,6 @@ __all__ = [
     "DependencyParser",
     "EntityLinker",
     "EntityRecognizer",
-    "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
     "MultiLabel_TextCategorizer",

View File

@@ -1,525 +0,0 @@
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
import warnings
from collections import defaultdict
from pathlib import Path
import srsly

from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf

DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]


@Language.factory(
    "entity_ruler",
    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
    default_config={
        "phrase_matcher_attr": None,
        "validate": False,
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
    },
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    return EntityRuler(
        nlp,
        name,
        phrase_matcher_attr=phrase_matcher_attr,
        validate=validate,
        overwrite_ents=overwrite_ents,
        ent_id_sep=ent_id_sep,
        scorer=scorer,
    )


def entity_ruler_score(examples, **kwargs):
    return get_ner_prf(examples)


@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
    return entity_ruler_score


class EntityRuler(Pipe):
    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
    rules or exact phrase matches. It can be combined with the statistical
    `EntityRecognizer` to boost accuracy, or used on its own to implement a
    purely rule-based entity recognition system. After initialization, the
    component is typically added to the pipeline using `nlp.add_pipe`.

    DOCS: https://spacy.io/api/entityruler
    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "entity_ruler",
        *,
        phrase_matcher_attr: Optional[Union[int, str]] = None,
        validate: bool = False,
        overwrite_ents: bool = False,
        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
        patterns: Optional[List[PatternType]] = None,
        scorer: Optional[Callable] = entity_ruler_score,
    ) -> None:
        """Initialize the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        name (str): Instance name of the current pipeline component. Typically
            passed in automatically from the factory when the component is
            added. Used to disable the current entity ruler while creating
            phrase patterns with the nlp object.
        phrase_matcher_attr (int / str): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`.
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        ent_id_sep (str): Separator used internally for entity IDs.
        scorer (Optional[Callable]): The scoring method. Defaults to
            spacy.scorer.get_ner_prf.

        DOCS: https://spacy.io/api/entityruler#init
        """
        self.nlp = nlp
        self.name = name
        self.overwrite = overwrite_ents
        self.token_patterns = defaultdict(list)  # type: ignore
        self.phrase_patterns = defaultdict(list)  # type: ignore
        self._validate = validate
        self.matcher = Matcher(nlp.vocab, validate=validate)
        self.phrase_matcher_attr = phrase_matcher_attr
        self.phrase_matcher = PhraseMatcher(
            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
        )
        self.ent_id_sep = ent_id_sep
        self._ent_ids = defaultdict(tuple)  # type: ignore
        if patterns is not None:
            self.add_patterns(patterns)
        self.scorer = scorer

    def __len__(self) -> int:
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label: str) -> bool:
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc: Doc) -> Doc:
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.

        DOCS: https://spacy.io/api/entityruler#call
        """
        error_handler = self.get_error_handler()
        try:
            matches = self.match(doc)
            self.set_annotations(doc, matches)
            return doc
        except Exception as e:
            return error_handler(self.name, self, [doc], e)

    def match(self, doc: Doc):
        self._require_patterns()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="\\[W036")
            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))

        final_matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], -m[1])
        final_matches = sorted(final_matches, key=get_sort_key, reverse=True)
        return final_matches

    def set_annotations(self, doc, matches):
        """Modify the document in place."""
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                if match_id in self._ent_ids:
                    label, ent_id = self._ent_ids[match_id]
                    span = Span(doc, start, end, label=label, span_id=ent_id)
                else:
                    span = Span(doc, start, end, label=match_id)
                new_entities.append(span)
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities

    @property
    def labels(self) -> Tuple[str, ...]:
        """All labels present in the match patterns.

        RETURNS (set): The string labels.

        DOCS: https://spacy.io/api/entityruler#labels
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_labels = set()
        for l in keys:
            if self.ent_id_sep in l:
                label, _ = self._split_label(l)
                all_labels.add(label)
            else:
                all_labels.add(l)
        return tuple(sorted(all_labels))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        patterns: Optional[Sequence[PatternType]] = None,
    ):
        """Initialize the pipe for training.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        patterns (Optional[Iterable[PatternType]]): The list of patterns.

        DOCS: https://spacy.io/api/entityruler#initialize
        """
        self.clear()
        if patterns:
            self.add_patterns(patterns)  # type: ignore[arg-type]

    @property
    def ent_ids(self) -> Tuple[Optional[str], ...]:
        """All entity IDs present in the match patterns' `id` properties.

        RETURNS (set): The string entity ids.

        DOCS: https://spacy.io/api/entityruler#ent_ids
        """
        keys = set(self.token_patterns.keys())
        keys.update(self.phrase_patterns.keys())
        all_ent_ids = set()
        for l in keys:
            if self.ent_id_sep in l:
                _, ent_id = self._split_label(l)
                all_ent_ids.add(ent_id)
        return tuple(all_ent_ids)

    @property
    def patterns(self) -> List[PatternType]:
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.

        DOCS: https://spacy.io/api/entityruler#patterns
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                ent_label, ent_id = self._split_label(label)
                p = {"label": ent_label, "pattern": pattern.text}
                if ent_id:
                    p["id"] = ent_id
                all_patterns.append(p)
        return all_patterns

    def add_patterns(self, patterns: List[PatternType]) -> None:
        """Add patterns to the entity ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/entityruler#add_patterns
        """
        # disable the nlp components after this one in case they hadn't been
        # initialized / deserialised yet
        try:
            current_index = -1
            for i, (name, pipe) in enumerate(self.nlp.pipeline):
                if self == pipe:
                    current_index = i
                    break
            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]]
        except ValueError:
            subsequent_pipes = []
        with self.nlp.select_pipes(disable=subsequent_pipes):
            token_patterns = []
            phrase_pattern_labels = []
            phrase_pattern_texts = []
            phrase_pattern_ids = []
            for entry in patterns:
                if isinstance(entry["pattern"], str):
                    phrase_pattern_labels.append(entry["label"])
                    phrase_pattern_texts.append(entry["pattern"])
                    phrase_pattern_ids.append(entry.get("id"))
                elif isinstance(entry["pattern"], list):
                    token_patterns.append(entry)
            phrase_patterns = []
            for label, pattern, ent_id in zip(
                phrase_pattern_labels,
                self.nlp.pipe(phrase_pattern_texts),
                phrase_pattern_ids,
            ):
                phrase_pattern = {"label": label, "pattern": pattern}
                if ent_id:
                    phrase_pattern["id"] = ent_id
                phrase_patterns.append(phrase_pattern)
            for entry in token_patterns + phrase_patterns:  # type: ignore[operator]
                label = entry["label"]  # type: ignore
                if "id" in entry:
                    ent_label = label
                    label = self._create_label(label, entry["id"])
                    key = self.matcher._normalize_key(label)
                    self._ent_ids[key] = (ent_label, entry["id"])
                pattern = entry["pattern"]  # type: ignore
                if isinstance(pattern, Doc):
                    self.phrase_patterns[label].append(pattern)
                    self.phrase_matcher.add(label, [pattern])  # type: ignore
                elif isinstance(pattern, list):
                    self.token_patterns[label].append(pattern)
                    self.matcher.add(label, [pattern])
                else:
                    raise ValueError(Errors.E097.format(pattern=pattern))

    def clear(self) -> None:
        """Reset all patterns."""
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self._ent_ids = defaultdict(tuple)
        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
        )

    def remove(self, ent_id: str) -> None:
        """Remove a pattern by its ent_id if a pattern with this ent_id was
        added before.

        ent_id (str): id of the pattern to be removed.
        RETURNS: None

        DOCS: https://spacy.io/api/entityruler#remove
        """
        label_id_pairs = [
            (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
        ]
        if not label_id_pairs:
            raise ValueError(
                Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name)
            )
        created_labels = [
            self._create_label(label, eid) for (label, eid) in label_id_pairs
        ]
        # remove the patterns from self.phrase_patterns
        self.phrase_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.phrase_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from self.token_patterns
        self.token_patterns = defaultdict(
            list,
            {
                label: val
                for (label, val) in self.token_patterns.items()
                if label not in created_labels
            },
        )
        # remove the patterns from the matchers
        for label in created_labels:
            if label in self.phrase_matcher:
                self.phrase_matcher.remove(label)
            else:
                self.matcher.remove(label)

    def _require_patterns(self) -> None:
        """Raise a warning if this component has no patterns defined."""
        if len(self) == 0:
            warnings.warn(Warnings.W036.format(name=self.name))

    def _split_label(self, label: str) -> Tuple[str, Optional[str]]:
        """Split entity label into ent_label and ent_id if it contains
        self.ent_id_sep.

        label (str): The value of label in a pattern entry
        RETURNS (tuple): ent_label, ent_id
        """
        if self.ent_id_sep in label:
            ent_label, ent_id = label.rsplit(self.ent_id_sep, 1)
        else:
            ent_label = label
            ent_id = None  # type: ignore
        return ent_label, ent_id

    def _create_label(self, label: Any, ent_id: Any) -> str:
        """Join entity label with ent_id if the pattern has an `id` attribute.
        If ent_id is not a string, the label is returned as is.

        label (str): The label to set for ent.label_
        ent_id (str): The ent_id to attach to the label
        RETURNS (str): The ent_label joined with configured `ent_id_sep`
        """
        if isinstance(ent_id, str):
            label = f"{label}{self.ent_id_sep}{ent_id}"
        return label

    def from_bytes(
        self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_bytes
        """
        cfg = srsly.msgpack_loads(patterns_bytes)
        self.clear()
        if isinstance(cfg, dict):
            self.add_patterns(cfg.get("patterns", cfg))
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab, attr=self.phrase_matcher_attr
            )
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
        else:
            self.add_patterns(cfg)
        return self

    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.

        DOCS: https://spacy.io/api/entityruler#to_bytes
        """
        serial = {
            "overwrite": self.overwrite,
            "ent_id_sep": self.ent_id_sep,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "patterns": self.patterns,
        }
        return srsly.msgpack_dumps(serial)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityRuler":
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (str / Path): The JSONL file to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        self.clear()
        depr_patterns_path = path.with_suffix(".jsonl")
        if path.suffix == ".jsonl":  # user provides a jsonl
            if path.is_file():
                patterns = srsly.read_jsonl(path)
                self.add_patterns(patterns)
            else:
                raise ValueError(Errors.E1023.format(path=path))
        elif depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        elif path.is_dir():  # path is a valid directory
            cfg = {}
            deserializers_patterns = {
                "patterns": lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl"))
                )
            }
            deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
            self.phrase_matcher = PhraseMatcher(
                self.nlp.vocab, attr=self.phrase_matcher_attr
            )
            from_disk(path, deserializers_patterns, {})
        else:  # path is not a valid directory or file
            raise ValueError(Errors.E146.format(path=path))
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (str / Path): The JSONL file to save.

        DOCS: https://spacy.io/api/entityruler#to_disk
        """
        path = ensure_path(path)
        cfg = {
            "overwrite": self.overwrite,
            "phrase_matcher_attr": self.phrase_matcher_attr,
            "ent_id_sep": self.ent_id_sep,
        }
        serializers = {
            "patterns": lambda p: srsly.write_jsonl(
                p.with_suffix(".jsonl"), self.patterns
            ),
            "cfg": lambda p: srsly.write_json(p, cfg),
        }
        if path.suffix == ".jsonl":  # user wants to save only JSONL
            srsly.write_jsonl(path, self.patterns)
        else:
            to_disk(path, serializers, {})

View File

@@ -11,7 +11,7 @@ from ..language import Language
 from ..errors import Errors, Warnings
 from ..util import ensure_path, SimpleFrozenList, registry
 from ..tokens import Doc, Span
-from ..scorer import Scorer
+from ..scorer import Scorer, get_ner_prf
 from ..matcher import Matcher, PhraseMatcher
 from .. import util
@@ -20,7 +20,7 @@ DEFAULT_SPANS_KEY = "ruler"

 @Language.factory(
-    "future_entity_ruler",
+    "entity_ruler",
     assigns=["doc.ents"],
     default_config={
         "phrase_matcher_attr": None,
@@ -63,6 +63,15 @@ def make_entity_ruler(
     )

+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
 @Language.factory(
     "span_ruler",
     assigns=["doc.spans"],
@@ -117,7 +126,7 @@ def prioritize_new_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by allowing
     spans to overwrite any entities that they overlap with. Intended to
-    replicate the overwrite_ents=True behavior from the EntityRuler.
+    replicate the overwrite_ents=True behavior from the v3 EntityRuler.

     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.
@@ -148,7 +157,7 @@ def prioritize_existing_ents_filter(
 ) -> List[Span]:
     """Merge entities and spans into one list without overlaps by prioritizing
     existing entities. Intended to replicate the overwrite_ents=False behavior
-    from the EntityRuler.
+    from the v3 EntityRuler.

     entities (Iterable[Span]): The entities, already filtered for overlaps.
     spans (Iterable[Span]): The spans to merge, may contain overlaps.

View File

@@ -87,14 +87,15 @@ def test_issue4373():

 @pytest.mark.issue(4651)
 def test_issue4651_with_phrase_matcher_attr():
-    """Test that the EntityRuler PhraseMatcher is deserialized correctly using
-    the method from_disk when the EntityRuler argument phrase_matcher_attr is
+    """Test that the entity_ruler PhraseMatcher is deserialized correctly using
+    the method from_disk when the entity_ruler argument phrase_matcher_attr is
     specified.
     """
     text = "Spacy is a python library for nlp"
     nlp = English()
     patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
-    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    config = {"phrase_matcher_attr": "LOWER"}
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp(text)
     res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
@@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr():
     with make_tempdir() as d:
         file_path = d / "entityruler"
         ruler.to_disk(file_path)
-        nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path)
+        nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path)
         doc_reloaded = nlp_reloaded(text)
         res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
         assert res == res_reloaded

View File

@@ -4,7 +4,7 @@ from spacy import registry
 from spacy.tokens import Doc, Span
 from spacy.language import Language
 from spacy.lang.en import English
-from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities
+from spacy.pipeline import EntityRecognizer, merge_entities
 from spacy.pipeline import SpanRuler
 from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.errors import MatchPatternError
@@ -12,8 +12,6 @@ from spacy.tests.util import make_tempdir
 from thinc.api import NumpyOps, get_current_ops

-ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"]
-

 @pytest.fixture
 def nlp():
@@ -40,13 +38,12 @@ def add_ent_component(doc):

 @pytest.mark.issue(3345)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue3345(entity_ruler_factory):
+def test_issue3345():
     """Test case where preset entity crosses sentence boundary."""
     nlp = English()
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
@@ -65,15 +62,14 @@ def test_issue3345(entity_ruler_factory):

 @pytest.mark.issue(4849)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue4849(entity_ruler_factory):
+def test_issue4849():
     nlp = English()
     patterns = [
         {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
         {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
     ]
     ruler = nlp.add_pipe(
-        entity_ruler_factory,
+        "entity_ruler",
         name="entity_ruler",
         config={"phrase_matcher_attr": "LOWER"},
     )
@@ -96,11 +92,10 @@ def test_issue4849(entity_ruler_factory):

 @pytest.mark.issue(5918)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue5918(entity_ruler_factory):
+def test_issue5918():
     # Test edge case when merging entities.
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Digicon Inc"},
         {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
@@ -125,10 +120,9 @@ def test_issue5918(entity_ruler_factory):

 @pytest.mark.issue(8168)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_issue8168(entity_ruler_factory):
+def test_issue8168():
     nlp = English()
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "ORG", "pattern": "Apple"},
         {
@@ -148,12 +142,9 @@ def test_issue8168(entity_ruler_factory):

 @pytest.mark.issue(8216)
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_fix8216(nlp, patterns):
     """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"validate": True}
-    )
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
     ruler.add_patterns(patterns)
     pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
     assert pattern_count > 0
@@ -162,16 +153,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory):
     assert after_count == pattern_count

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_init(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     assert "HELLO" in ruler
     assert "BYE" in ruler
     nlp.remove_pipe("entity_ruler")
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     doc = nlp("hello world bye bye")
     assert len(doc.ents) == 2
@@ -179,23 +169,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_no_patterns_warns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler) == 0
     assert len(ruler.labels) == 0
     nlp.remove_pipe("entity_ruler")
-    nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    nlp.add_pipe("entity_ruler")
     assert nlp.pipe_names == ["entity_ruler"]
     with pytest.warns(UserWarning):
         doc = nlp("hello world bye bye")
     assert len(doc.ents) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_patterns(nlp, patterns):
     # initialize with patterns
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     ruler.initialize(lambda: [], patterns=patterns)
     assert len(ruler.labels) == 4
@@ -207,7 +195,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     nlp.config["initialize"]["components"]["entity_ruler"] = {
         "patterns": {"@misc": "entity_ruler_patterns"}
     }
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     assert len(ruler.labels) == 0
     nlp.initialize()
     assert len(ruler.labels) == 4
@@ -216,20 +204,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_init_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     ruler.initialize(lambda: [])
     assert len(ruler.labels) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_clear(nlp, patterns):
     """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     assert len(ruler.labels) == 4
     doc = nlp("hello world")
@@ -241,9 +227,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_existing(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -252,11 +237,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_overwrite(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("OH HELLO WORLD bye bye")
@@ -266,11 +248,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory):
     assert doc.ents[1].label_ == "BYE"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_existing_complex(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     nlp.add_pipe("add_ent", before="entity_ruler")
     doc = nlp("foo foo bye bye")
@@ -281,11 +260,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory):
     assert len(doc.ents[1]) == 2

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(
-        entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True}
-    )
+def test_entity_ruler_entity_id(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
     assert len(doc.ents) == 1
@@ -293,26 +269,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory):
     assert doc.ents[0].ent_id_ == "a1"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory):
+def test_entity_ruler_cfg_ent_id_sep(nlp, patterns):
     config = {"overwrite_ents": True, "ent_id_sep": "**"}
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config)
+    ruler = nlp.add_pipe("entity_ruler", config=config)
     ruler.add_patterns(patterns)
     doc = nlp("Apple is a technology company")
-    if isinstance(ruler, EntityRuler):
-        assert "TECH_ORG**a1" in ruler.phrase_patterns
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "TECH_ORG"
     assert doc.ents[0].ent_id_ == "a1"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns)
+def test_entity_ruler_serialize_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler")
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
     new_ruler = new_ruler.from_bytes(ruler_bytes)
@@ -324,28 +297,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory):
     assert sorted(new_ruler.labels) == sorted(ruler.labels)

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_phrase_matcher_attr_bytes(
-    nlp, patterns, entity_ruler_factory
-):
-    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns)
+def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
+    ruler.add_patterns(patterns)
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
     ruler_bytes = ruler.to_bytes()
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"}
+    )
     assert len(new_ruler) == 0
     assert len(new_ruler.labels) == 0
-    assert new_ruler.phrase_matcher_attr is None
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(patterns)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.phrase_matcher_attr == "LOWER"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_validate(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
-    validated_ruler = EntityRuler(nlp, validate=True)
+def test_entity_ruler_validate(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    validated_ruler = nlp.add_pipe(
+        "entity_ruler", name="validated_ruler", config={"validate": True}
+    )
     valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]}
     invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]}
@@ -362,16 +334,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory):
         validated_ruler.add_patterns([invalid_pattern])

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory):
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+def test_entity_ruler_properties(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
-    assert sorted(ruler.ent_ids) == ["a1", "a2"]
+    assert sorted(ruler.ids) == ["a1", "a2"]

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_overlapping_spans(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "FOOBAR", "pattern": "foo bar"},
         {"label": "BARBAZ", "pattern": "bar baz"},
@@ -383,14 +354,13 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):

 @pytest.mark.parametrize("n_process", [1, 2])
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
+def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:
         texts = ["I enjoy eating Pizza Hut pizza."]
         patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
-        ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+        ruler = nlp.add_pipe("entity_ruler")
         ruler.add_patterns(patterns)

         for doc in nlp.pipe(texts, n_process=2):
@@ -398,9 +368,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
             assert ent.ent_id_ == "1234"

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_jsonl(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler.jsonl")
@@ -409,9 +378,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
         ruler.from_disk(d / "non_existing.jsonl")  # read from a bad jsonl file

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_serialize_dir(nlp, patterns):
+    ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     with make_tempdir() as d:
         ruler.to_disk(d / "test_ruler")
@@ -420,9 +388,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory):
         ruler.from_disk(d / "non_existing_dir")  # read from a bad directory

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_basic(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -432,24 +399,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory):
     doc = nlp("Dina went to school")
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
     assert doc.ents[0].label_ == "PERSON"
     assert doc.ents[0].text == "Dina"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina went to school")
     assert len(doc.ents) == 0
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
     assert len(ruler.patterns) == 2

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "DinaCorp", "id": "dina"},
@@ -458,25 +417,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory):
     ruler.add_patterns(patterns)
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" in ruler.phrase_matcher
-        assert "ORG||dina" in ruler.phrase_matcher
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded DinaCorp and ACME.")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        assert "PERSON||dina" not in ruler.phrase_matcher
-        assert "ORG||dina" not in ruler.phrase_matcher
     assert len(doc.ents) == 1

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_nonexisting_pattern(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -491,9 +440,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory):
         ruler.remove_by_id("nepattern")

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_several_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -507,27 +455,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "Dina"
     assert doc.ents[1].label_ == "ORG"
     assert doc.ents[1].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 2
     assert len(doc.ents) == 1
     assert doc.ents[0].label_ == "ORG"
     assert doc.ents[0].text == "ACME"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     doc = nlp("Dina founded her company ACME")
     assert len(ruler.patterns) == 1
     assert len(doc.ents) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_patterns_in_a_row(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -543,11 +484,6 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert doc.ents[1].text == "ACME"
     assert doc.ents[2].label_ == "DATE"
     assert doc.ents[2].text == "her birthday"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-        ruler.remove("acme")
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("dina")
-        ruler.remove_by_id("acme")
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("dina")
+    ruler.remove_by_id("acme")
+    ruler.remove_by_id("bday")
@@ -555,9 +491,8 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory):
     assert len(doc.ents) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_all_patterns(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [
         {"label": "PERSON", "pattern": "Dina", "id": "dina"},
         {"label": "ORG", "pattern": "ACME", "id": "acme"},
@@ -565,19 +500,10 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     ]
     ruler.add_patterns(patterns)
     assert len(ruler.patterns) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("dina")
-    else:
-        ruler.remove_by_id("dina")
+    ruler.remove_by_id("dina")
     assert len(ruler.patterns) == 2
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("acme")
-    else:
-        ruler.remove_by_id("acme")
+    ruler.remove_by_id("acme")
     assert len(ruler.patterns) == 1
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("bday")
-    else:
-        ruler.remove_by_id("bday")
+    ruler.remove_by_id("bday")
     assert len(ruler.patterns) == 0
     with pytest.warns(UserWarning):
@@ -585,9 +511,8 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory):
     assert len(doc.ents) == 0

-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
-def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
-    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
+def test_entity_ruler_remove_and_add(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
     patterns = [{"label": "DATE", "pattern": "last time"}]
     ruler.add_patterns(patterns)
     doc = ruler(
@@ -608,9 +533,6 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     assert doc.ents[0].text == "last time"
     assert doc.ents[1].label_ == "DATE"
     assert doc.ents[1].text == "this time"
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc("I saw him last time we met, this time he brought some flowers")
@@ -634,9 +556,6 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory):
     )
     assert len(ruler.patterns) == 3
     assert len(doc.ents) == 3
-    if isinstance(ruler, EntityRuler):
-        ruler.remove("ttime")
-    else:
-        ruler.remove_by_id("ttime")
+    ruler.remove_by_id("ttime")
     doc = ruler(
         nlp.make_doc(

View File

@@ -8,7 +8,7 @@ import spacy
 from spacy import Vocab, load, registry
 from spacy.lang.en import English
 from spacy.language import Language
-from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler
+from spacy.pipeline import DependencyParser, EntityRecognizer
 from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer
 from spacy.pipeline import TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
@@ -85,58 +85,17 @@ def test_issue_3526_1(en_vocab):
         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
     ]
     nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
+    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
+    ruler.add_patterns(patterns)
     ruler_bytes = ruler.to_bytes()
     assert len(ruler) == len(patterns)
     assert len(ruler.labels) == 4
-    assert ruler.overwrite
-    new_ruler = EntityRuler(nlp)
+    new_ruler = nlp.add_pipe(
+        "entity_ruler", name="new_ruler", config={"overwrite_ents": True}
+    )
     new_ruler = new_ruler.from_bytes(ruler_bytes)
     assert len(new_ruler) == len(ruler)
     assert len(new_ruler.labels) == 4
-    assert new_ruler.overwrite == ruler.overwrite
-    assert new_ruler.ent_id_sep == ruler.ent_id_sep
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_2(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
-    new_ruler = EntityRuler(nlp)
-    new_ruler = new_ruler.from_bytes(bytes_old_style)
-    assert len(new_ruler) == len(ruler)
-    for pattern in ruler.patterns:
-        assert pattern in new_ruler.patterns
-    assert new_ruler.overwrite is not ruler.overwrite
-
-
-@pytest.mark.issue(3526)
-def test_issue_3526_3(en_vocab):
-    patterns = [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-    ]
-    nlp = Language(vocab=en_vocab)
-    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
-    with make_tempdir() as tmpdir:
-        out_file = tmpdir / "entity_ruler"
-        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
-        new_ruler = EntityRuler(nlp).from_disk(out_file)
-        for pattern in ruler.patterns:
-            assert pattern in new_ruler.patterns
-        assert len(new_ruler) == len(ruler)
-        assert new_ruler.overwrite is not ruler.overwrite

 @pytest.mark.issue(3526)
@@ -150,16 +109,14 @@ def test_issue_3526_4(en_vocab):
         nlp.to_disk(tmpdir)
         ruler = nlp.get_pipe("entity_ruler")
         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert ruler.overwrite is True
         nlp2 = load(tmpdir)
         new_ruler = nlp2.get_pipe("entity_ruler")
         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
-        assert new_ruler.overwrite is True

 @pytest.mark.issue(4042)
 def test_issue4042():
-    """Test that serialization of an EntityRuler before NER works fine."""
+    """Test that serialization of an entity_ruler before NER works fine."""
     nlp = English()
     # add ner pipe
     ner = nlp.add_pipe("ner")

View File

@ -1,13 +1,24 @@
--- ---
title: EntityRuler title: EntityRuler
tag: class
source: spacy/pipeline/entity_ruler.py
new: 2.1 new: 2.1
teaser: 'Pipeline component for rule-based named entity recognition' teaser: 'Pipeline component for rule-based named entity recognition'
api_string_name: entity_ruler api_string_name: entity_ruler
api_trainable: false api_trainable: false
--- ---
<Infobox title="New in v4" variant="warning">
As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
implemented as a special case of the `SpanRuler` component.
See the [migration guide](#migrating) below for differences between the v3
`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
component.
See the [`SpanRuler`](/api/spanruler) API docs for the full API.
</Infobox>
The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
token-based rules or exact phrase matches. It can be combined with the token-based rules or exact phrase matches. It can be combined with the
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
@ -63,271 +74,51 @@ how the component should be configured. You can override its settings via the
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python ## Migrating from v3 {#migrating}
%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py
### Loading patterns
Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
JSONL file separately and then added through
[`SpanRuler.initialize`](/api/spanruler#initialize) or
[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).
```diff
ruler = nlp.get_pipe("entity_ruler")
- ruler.from_disk("patterns.jsonl")
+ import srsly
+ patterns = srsly.read_jsonl("patterns.jsonl")
+ ruler.add_patterns(patterns)
``` ```
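Put together, a minimal sketch of the v4 loading flow, assuming a `patterns.jsonl` file with one pattern dict per line (file name is illustrative):

```python
import spacy
import srsly

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# Each JSONL line is a dict such as {"label": "ORG", "pattern": "Apple"}
patterns = list(srsly.read_jsonl("patterns.jsonl"))
ruler.add_patterns(patterns)
```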
## EntityRuler.\_\_init\_\_ {#init tag="method"} ### Saving patterns
Initialize the entity ruler. If patterns are supplied here, they need to be a `SpanRuler.to_disk` always saves the full component data to a directory and does
list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either not include an option to save the patterns to a single JSONL file.
be a token pattern (list) or a phrase pattern (string). For example:
`{"label": "ORG", "pattern": "Apple"}`.
> #### Example ```diff
> ruler = nlp.get_pipe("entity_ruler")
> ```python - ruler.to_disk("patterns.jsonl")
> # Construction via add_pipe + import srsly
> ruler = nlp.add_pipe("entity_ruler") + srsly.write_jsonl("patterns.jsonl", ruler.patterns)
> ```
> # Construction from class
> from spacy.pipeline import EntityRuler
> ruler = EntityRuler(nlp, overwrite_ents=True)
> ```
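A short sketch of both saving options in v4, assuming an `nlp` pipeline with an `entity_ruler` component (paths are illustrative):

```python
import srsly

ruler = nlp.get_pipe("entity_ruler")
# Save the full component data to a directory
ruler.to_disk("entity_ruler_data")
# Or write only the patterns to a JSONL file yourself
srsly.write_jsonl("patterns.jsonl", ruler.patterns)
```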
| Name | Description | ### Accessing token and phrase patterns
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
## EntityRuler.initialize {#initialize tag="method" new="3"} The separate token patterns and phrase patterns are no longer accessible under
`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
patterns in their original format using the property
[`SpanRuler.patterns`](/api/spanruler#patterns).
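For example, a quick sketch of inspecting the combined patterns (labels and patterns are illustrative):

```python
ruler = nlp.get_pipe("entity_ruler")
ruler.add_patterns([
    # phrase pattern
    {"label": "ORG", "pattern": "Apple"},
    # token pattern
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
])
# Both kinds are returned together in their original dict format
print(ruler.patterns)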
Initialize the component with data and used before training to load in rules ### Removing patterns by ID
from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
is typically called by [`Language.initialize`](/api/language#initialize) and
lets you customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
> #### Example [`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
> remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):
> ```python
> entity_ruler = nlp.add_pipe("entity_ruler")
> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.entity_ruler]
>
> [initialize.components.entity_ruler.patterns]
> @readers = "srsly.read_jsonl.v1"
> path = "corpus/entity_ruler_patterns.jsonl"
> ```
| Name | Description | ```diff
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ruler = nlp.get_pipe("entity_ruler")
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | - ruler.remove("id")
| _keyword-only_ | | + ruler.remove_by_id("id")
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | ```
| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ |
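As a sketch, assuming a pattern added with `"id": "apple"`:

```python
ruler = nlp.get_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Apple", "id": "apple"}])
# In v4, SpanRuler.remove removes all patterns with a given label:
# ruler.remove("ORG")
# To remove by ID as the v3 EntityRuler did, use remove_by_id:
ruler.remove_by_id("apple")
```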
## EntityRuler.\_\_len\_\_ {#len tag="method"}
The number of all patterns added to the entity ruler.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> assert len(ruler) == 0
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
> assert len(ruler) == 1
> ```
| Name | Description |
| ----------- | ------------------------------- |
| **RETURNS** | The number of patterns. ~~int~~ |
## EntityRuler.\_\_contains\_\_ {#contains tag="method"}
Whether a label is present in the patterns.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
> assert "ORG" in ruler
> assert "PERSON" not in ruler
> ```
| Name | Description |
| ----------- | ----------------------------------------------------- |
| `label` | The label to check. ~~str~~ |
| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
## EntityRuler.\_\_call\_\_ {#call tag="method"}
Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
longer patterns over shorter, and if equal the match occurring first in the Doc
is chosen.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
>
> doc = nlp("A text about Apple.")
> ents = [(ent.text, ent.label_) for ent in doc.ents]
> assert ents == [("Apple", "ORG")]
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------- |
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ |
## EntityRuler.add_patterns {#add_patterns tag="method"}
Add patterns to the entity ruler. A pattern can either be a token pattern (list
of dicts) or a phrase pattern (string). For more details, see the usage guide on
[rule-based matching](/usage/rule-based-matching).
> #### Example
>
> ```python
> patterns = [
> {"label": "ORG", "pattern": "Apple"},
> {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}
> ]
> ruler = nlp.add_pipe("entity_ruler")
> ruler.add_patterns(patterns)
> ```
| Name | Description |
| ---------- | ---------------------------------------------------------------- |
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
## EntityRuler.remove {#remove tag="method" new="3.2.1"}
Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist.
> #### Example
>
> ```python
> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
> ruler = nlp.add_pipe("entity_ruler")
> ruler.add_patterns(patterns)
> ruler.remove("apple")
> ```
| Name | Description |
| ---------- | ---------------------------------------------------------------- |
| `id` | The ID of the pattern rule. ~~str~~ |
## EntityRuler.to_disk {#to_disk tag="method"}
Save the entity ruler patterns to a directory. The patterns will be saved as
newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided,
only the patterns are saved as JSONL. If a directory name is provided, a
`patterns.jsonl` and `cfg` file with the component configuration is exported.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
> ```
| Name | Description |
| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## EntityRuler.from_disk {#from_disk tag="method"}
Load the entity ruler from a path. Expects either a file containing
newline-delimited JSON (JSONL) with one entry per line, or a directory
containing a `patterns.jsonl` file and a `cfg` file with the component
configuration.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------- |
| `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
## EntityRuler.to_bytes {#to_bytes tag="method"}
Serialize the entity ruler patterns to a bytestring.
> #### Example
>
> ```python
> ruler = nlp.add_pipe("entity_ruler")
> ruler_bytes = ruler.to_bytes()
> ```
| Name | Description |
| ----------- | ---------------------------------- |
| **RETURNS** | The serialized patterns. ~~bytes~~ |
## EntityRuler.from_bytes {#from_bytes tag="method"}
Load the pipe from a bytestring. Modifies the object in place and returns it.
> #### Example
>
> ```python
> ruler_bytes = ruler.to_bytes()
> ruler = nlp.add_pipe("entity_ruler")
> ruler.from_bytes(ruler_bytes)
> ```
| Name | Description |
| ------------ | -------------------------------------------------- |
| `bytes_data` | The bytestring to load. ~~bytes~~ |
| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
## EntityRuler.labels {#labels tag="property"}
All labels present in the match patterns.
| Name | Description |
| ----------- | -------------------------------------- |
| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"}
All entity IDs present in the `id` properties of the match patterns.
| Name | Description |
| ----------- | ----------------------------------- |
| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
## EntityRuler.patterns {#patterns tag="property"}
Get all patterns that were added to the entity ruler.
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------- |
| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
## Attributes {#attributes}
| Name | Description |
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]]~~ |
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@ -13,6 +13,17 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or
usage examples, see the docs on usage examples, see the docs on
[rule-based span matching](/usage/rule-based-matching#spanruler). [rule-based span matching](/usage/rule-based-matching#spanruler).
<Infobox title="Replacement of the EntityRuler" variant="warning">
As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
implemented as a special case of the `SpanRuler` component.
See the [migration guide](/api/entityruler#migrating) for differences between
the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
component.
</Infobox>
## Assigned Attributes {#assigned-attributes} ## Assigned Attributes {#assigned-attributes}
Matches will be saved to `Doc.spans[spans_key]` as a Matches will be saved to `Doc.spans[spans_key]` as a

View File

@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding
![The processing pipeline](../../images/pipeline.svg) ![The processing pipeline](../../images/pipeline.svg)
| Name | Description | | Component name | Component class | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------- | | ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- |
| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | | `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | | `entity_linker` | [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. | | `entity_ruler` | [`SpanRuler`](/api/spanruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | | `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. |
| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. | | `parser` | [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | | `span_ruler` | [`SpanRuler`](/api/spanruler) | Add spans to the `Doc` using token-based rules or exact phrase matches. |
| [`Tagger`](/api/tagger) | Predict part-of-speech tags. | | `tagger` | [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Predict exactly one category or label over a whole document. |
| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | | `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document. |
| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | | `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
| [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. | | `tokenizer` | [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | | `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. |
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | | `transformer` | [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
| - | [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. |
| - | [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
### Matchers {#architecture-matchers} ### Matchers {#architecture-matchers}

View File

@ -53,9 +53,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
only work if it's added after the tagger. The parser will respect pre-defined only work if it's added after the tagger. The parser will respect pre-defined
sentence boundaries, so if a previous component in the pipeline sets them, its sentence boundaries, so if a previous component in the pipeline sets them, its
dependency predictions may be different. Similarly, it matters if you add the dependency predictions may be different. Similarly, it matters if you add the
[`EntityRuler`](/api/entityruler) before or after the statistical entity [`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer:
recognizer: if it's added before, the entity recognizer will take the existing if it's added before and it is writing to `doc.ents`, then the entity recognizer
entities into account when making predictions. The will take those existing entities into account when making predictions. The
[`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge
base IDs, should be preceded by a pipeline component that recognizes entities base IDs, should be preceded by a pipeline component that recognizes entities
such as the [`EntityRecognizer`](/api/entityrecognizer). such as the [`EntityRecognizer`](/api/entityrecognizer).

View File

@ -303,13 +303,14 @@ available pipeline components and component functions.
> ruler = nlp.add_pipe("entity_ruler") > ruler = nlp.add_pipe("entity_ruler")
> ``` > ```
| String name | Component | Description | | Component name | Component class | Description |
| ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- | | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | | `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. |
| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | | `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. |
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | | `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | | `span_ruler` | [`SpanRuler`](/api/spanruler) | Assign spans based on pattern rules and dictionaries. |
| `entity_ruler` | [`SpanRuler`](/api/spanruler) | Assign named entities based on pattern rules and dictionaries. |
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories: exactly one category is predicted per document. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories: exactly one category is predicted per document. |
| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document. | | `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document. |
| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words using rules and lookups. | | `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words using rules and lookups. |

View File

@ -375,7 +375,7 @@ scoped quantifiers instead, you can build those behaviors with `on_match`
callbacks. callbacks.
| OP | Description | | OP | Description |
|---------|------------------------------------------------------------------------| | ------- | ---------------------------------------------------------------------- |
| `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. | | `+` | Require the pattern to match 1 or more times. |
@ -471,7 +471,7 @@ matches = matcher(doc)
``` ```
A very similar logic has been implemented in the built-in A very similar logic has been implemented in the built-in
[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling [`entity_ruler`](/api/entityruler) by the way. It also takes care of handling
overlapping matches, which you would otherwise have to take care of yourself. overlapping matches, which you would otherwise have to take care of yourself.
> #### Tip: Visualizing matches > #### Tip: Visualizing matches
@ -1270,7 +1270,7 @@ of patterns such as `{}` that match any token in the sentence.
## Rule-based entity recognition {#entityruler new="2.1"} ## Rule-based entity recognition {#entityruler new="2.1"}
The [`EntityRuler`](/api/entityruler) is a component that lets you add named The [`entity_ruler`](/api/entityruler) is a component that lets you add named
entities based on pattern dictionaries, which makes it easy to combine entities based on pattern dictionaries, which makes it easy to combine
rule-based and statistical named entity recognition for even more powerful rule-based and statistical named entity recognition for even more powerful
pipelines. pipelines.
@ -1295,13 +1295,12 @@ pattern. The entity ruler accepts two types of patterns:
### Using the entity ruler {#entityruler-usage} ### Using the entity ruler {#entityruler-usage}
The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically The `entity_ruler` is a pipeline component that's typically added via
added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a
called on a text, it will find matches in the `doc` and add them as entities to text, it will find matches in the `doc` and add them as entities to `doc.ents`,
the `doc.ents`, using the specified pattern label as the entity label. If any using the specified pattern label as the entity label. If any matches were to
matches were to overlap, the pattern matching most tokens takes priority. If overlap, the pattern matching most tokens takes priority. If they also happen to
they also happen to be equally long, then the match occurring first in the `Doc` be equally long, then the match occurring first in the `Doc` is chosen.
is chosen.
```python ```python
### {executable="true"} ### {executable="true"}
@ -1339,7 +1338,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```
#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"} #### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"}
The entity ruler can validate patterns against a JSON schema with the config The entity ruler can validate patterns against a JSON schema with the config
setting `"validate"`. See details under setting `"validate"`. See details under
@ -1351,9 +1350,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"} ### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"}
The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for
pattern. Using the `id` attribute allows multiple patterns to be associated with each pattern. Using the `id` attribute allows multiple patterns to be associated
the same entity. with the same entity.
```python ```python
### {executable="true"} ### {executable="true"}
@ -1373,10 +1372,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents]) print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents])
``` ```
If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) If the `id` attribute is included in the [`entity_ruler`](/api/entityruler)
patterns, the `id_` property of the matched entity is set to the `id` given patterns, the `id_` property of the matched entity is set to the `id` given in
in the patterns. So in the example above it's easy to identify that "San the patterns. So in the example above it's easy to identify that "San Francisco"
Francisco" and "San Fran" are both the same entity. and "San Fran" are both the same entity.
### Using pattern files {#entityruler-files} ### Using pattern files {#entityruler-files}
@ -1400,13 +1399,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl")
If you're using the [Prodigy](https://prodi.gy) annotation tool, you might If you're using the [Prodigy](https://prodi.gy) annotation tool, you might
recognize these pattern files from bootstrapping your named entity and text recognize these pattern files from bootstrapping your named entity and text
classification labelling. The patterns for the `EntityRuler` follow the same classification labelling. The patterns for the `entity_ruler` follow the same
syntax, so you can use your existing Prodigy pattern files in spaCy, and vice syntax, so you can use your existing Prodigy pattern files in spaCy, and vice
versa. versa.
</Infobox> </Infobox>
When you save out an `nlp` object that has an `EntityRuler` added to its When you save out an `nlp` object that has an `entity_ruler` added to its
pipeline, its patterns are automatically exported to the pipeline directory: pipeline, its patterns are automatically exported to the pipeline directory:
```python ```python
@ -1429,9 +1428,9 @@ rules included!
When using a large amount of **phrase patterns** (roughly > 10000) it's useful When using a large amount of **phrase patterns** (roughly > 10000) it's useful
to understand how the `add_patterns` function of the entity ruler works. For to understand how the `add_patterns` function of the entity ruler works. For
each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc each **phrase pattern**, the entity ruler calls the nlp object to construct a
object. This happens in case you try to add the EntityRuler at the end of an doc object. This happens in case you try to add the entity ruler at the end of
existing pipeline with, for example, a POS tagger and want to extract matches an existing pipeline with, for example, a POS tagger and want to extract matches
based on the pattern's POS signature. In this case you would pass a config value based on the pattern's POS signature. In this case you would pass a config value
of `"phrase_matcher_attr": "POS"` for the entity ruler. of `"phrase_matcher_attr": "POS"` for the entity ruler.

View File

@ -193,13 +193,13 @@ the data to and from a JSON file.
> #### Real-world example > #### Real-world example
> >
> To see custom serialization methods in action, check out the new > To see custom serialization methods in action, check out the
> [`EntityRuler`](/api/entityruler) component and its > [`SpanRuler`](/api/spanruler) component and its
> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the > [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the
> component will be saved to a `.jsonl` file if the pipeline is serialized to > component will be saved to a `.jsonl` file if the pipeline is serialized to
> disk, and to a bytestring if the pipeline is serialized to bytes. This allows > disk, and to a bytestring if the pipeline is serialized to bytes. This allows
> saving out a pipeline with a rule-based entity recognizer and including all > saving out a pipeline with rule-based components _with_ all the component
> rules _with_ the component data. > data.
```python ```python
### {highlight="16-23,25-30"} ### {highlight="16-23,25-30"}

View File

@ -424,7 +424,7 @@ your components during training, and the most common scenarios are:
2. Update an existing **trained component** with more examples. 2. Update an existing **trained component** with more examples.
3. Include an existing trained component without updating it. 3. Include an existing trained component without updating it.
4. Include a non-trainable component, like a rule-based 4. Include a non-trainable component, like a rule-based
[`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a
fully [custom component](/usage/processing-pipelines#custom-components). fully [custom component](/usage/processing-pipelines#custom-components).
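As a sketch, such a non-trainable component might look like this in the config (component name and pattern path are illustrative):

```ini
### config.cfg (excerpt)
[components.entity_ruler]
factory = "entity_ruler"

[initialize.components.entity_ruler]

[initialize.components.entity_ruler.patterns]
@readers = "srsly.read_jsonl.v1"
path = "corpus/entity_ruler_patterns.jsonl"
```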
If a component block defines a `factory`, spaCy will look it up in the If a component block defines a `factory`, spaCy will look it up in the