diff --git a/spacy/errors.py b/spacy/errors.py index 856660106..820f7352e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -460,13 +460,13 @@ class Errors(metaclass=ErrorsWithCodes): "same, but found '{nlp}' and '{vocab}' respectively.") E152 = ("The attribute {attr} is not supported for token patterns. " "Please use the option `validate=True` with the Matcher, PhraseMatcher, " - "EntityRuler or AttributeRuler for more details.") + "SpanRuler or AttributeRuler for more details.") E153 = ("The value type {vtype} is not supported for token patterns. " "Please use the option validate=True with Matcher, PhraseMatcher, " - "EntityRuler or AttributeRuler for more details.") + "SpanRuler or AttributeRuler for more details.") E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option `validate=True` with the Matcher, " - "PhraseMatcher, or EntityRuler for more details.") + "PhraseMatcher, or SpanRuler for more details.") E155 = ("The pipeline needs to include a {pipe} in order to use " "Matcher or PhraseMatcher with the attribute {attr}. " "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` " @@ -917,8 +917,6 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " - "exist.") E1024 = ("A pattern with {attr_type} '{label}' is not present in " "'{component}' patterns.") E1025 = ("Cannot intify the value '{value}' as an IOB string. 
The only " diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 4744a989b..14dfed949 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,7 +3,6 @@ from .dep_parser import DependencyParser from .edit_tree_lemmatizer import EditTreeLemmatizer from .entity_linker import EntityLinker from .ner import EntityRecognizer -from .entity_ruler import EntityRuler from .lemmatizer import Lemmatizer from .morphologizer import Morphologizer from .pipe import Pipe @@ -23,7 +22,6 @@ __all__ = [ "DependencyParser", "EntityLinker", "EntityRecognizer", - "EntityRuler", "Morphologizer", "Lemmatizer", "MultiLabel_TextCategorizer", diff --git a/spacy/pipeline/entity_ruler.py b/spacy/pipeline/entity_ruler.py deleted file mode 100644 index 8154a077d..000000000 --- a/spacy/pipeline/entity_ruler.py +++ /dev/null @@ -1,525 +0,0 @@ -from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence -import warnings -from collections import defaultdict -from pathlib import Path -import srsly - -from .pipe import Pipe -from ..training import Example -from ..language import Language -from ..errors import Errors, Warnings -from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry -from ..tokens import Doc, Span -from ..matcher import Matcher, PhraseMatcher -from ..scorer import get_ner_prf - - -DEFAULT_ENT_ID_SEP = "||" -PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] - - -@Language.factory( - "entity_ruler", - assigns=["doc.ents", "token.ent_type", "token.ent_iob"], - default_config={ - "phrase_matcher_attr": None, - "validate": False, - "overwrite_ents": False, - "ent_id_sep": DEFAULT_ENT_ID_SEP, - "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) -def make_entity_ruler( - nlp: Language, - name: str, - phrase_matcher_attr: Optional[Union[int, str]], - validate: bool, - 
overwrite_ents: bool, - ent_id_sep: str, - scorer: Optional[Callable], -): - return EntityRuler( - nlp, - name, - phrase_matcher_attr=phrase_matcher_attr, - validate=validate, - overwrite_ents=overwrite_ents, - ent_id_sep=ent_id_sep, - scorer=scorer, - ) - - -def entity_ruler_score(examples, **kwargs): - return get_ner_prf(examples) - - -@registry.scorers("spacy.entity_ruler_scorer.v1") -def make_entity_ruler_scorer(): - return entity_ruler_score - - -class EntityRuler(Pipe): - """The EntityRuler lets you add spans to the `Doc.ents` using token-based - rules or exact phrase matches. It can be combined with the statistical - `EntityRecognizer` to boost accuracy, or used on its own to implement a - purely rule-based entity recognition system. After initialization, the - component is typically added to the pipeline using `nlp.add_pipe`. - - DOCS: https://spacy.io/api/entityruler - USAGE: https://spacy.io/usage/rule-based-matching#entityruler - """ - - def __init__( - self, - nlp: Language, - name: str = "entity_ruler", - *, - phrase_matcher_attr: Optional[Union[int, str]] = None, - validate: bool = False, - overwrite_ents: bool = False, - ent_id_sep: str = DEFAULT_ENT_ID_SEP, - patterns: Optional[List[PatternType]] = None, - scorer: Optional[Callable] = entity_ruler_score, - ) -> None: - """Initialize the entity ruler. If patterns are supplied here, they - need to be a list of dictionaries with a `"label"` and `"pattern"` - key. A pattern can either be a token pattern (list) or a phrase pattern - (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. - - nlp (Language): The shared nlp object to pass the vocab to the matchers - and process phrase patterns. - name (str): Instance name of the current pipeline component. Typically - passed in automatically from the factory when the component is - added. Used to disable the current entity ruler while creating - phrase patterns with the nlp object. 
- phrase_matcher_attr (int / str): Token attribute to match on, passed - to the internal PhraseMatcher as `attr` - validate (bool): Whether patterns should be validated, passed to - Matcher and PhraseMatcher as `validate` - patterns (iterable): Optional patterns to load in. - overwrite_ents (bool): If existing entities are present, e.g. entities - added by the model, overwrite them by matches if necessary. - ent_id_sep (str): Separator used internally for entity IDs. - scorer (Optional[Callable]): The scoring method. Defaults to - spacy.scorer.get_ner_prf. - - DOCS: https://spacy.io/api/entityruler#init - """ - self.nlp = nlp - self.name = name - self.overwrite = overwrite_ents - self.token_patterns = defaultdict(list) # type: ignore - self.phrase_patterns = defaultdict(list) # type: ignore - self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate) - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) - self.ent_id_sep = ent_id_sep - self._ent_ids = defaultdict(tuple) # type: ignore - if patterns is not None: - self.add_patterns(patterns) - self.scorer = scorer - - def __len__(self) -> int: - """The number of all patterns added to the entity ruler.""" - n_token_patterns = sum(len(p) for p in self.token_patterns.values()) - n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values()) - return n_token_patterns + n_phrase_patterns - - def __contains__(self, label: str) -> bool: - """Whether a label is present in the patterns.""" - return label in self.token_patterns or label in self.phrase_patterns - - def __call__(self, doc: Doc) -> Doc: - """Find matches in document and add them as entities. - - doc (Doc): The Doc object in the pipeline. - RETURNS (Doc): The Doc with added entities, if available. 
- - DOCS: https://spacy.io/api/entityruler#call - """ - error_handler = self.get_error_handler() - try: - matches = self.match(doc) - self.set_annotations(doc, matches) - return doc - except Exception as e: - return error_handler(self.name, self, [doc], e) - - def match(self, doc: Doc): - self._require_patterns() - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="\\[W036") - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) - - final_matches = set( - [(m_id, start, end) for m_id, start, end in matches if start != end] - ) - get_sort_key = lambda m: (m[2] - m[1], -m[1]) - final_matches = sorted(final_matches, key=get_sort_key, reverse=True) - return final_matches - - def set_annotations(self, doc, matches): - """Modify the document in place""" - entities = list(doc.ents) - new_entities = [] - seen_tokens = set() - for match_id, start, end in matches: - if any(t.ent_type for t in doc[start:end]) and not self.overwrite: - continue - # check for end - 1 here because boundaries are inclusive - if start not in seen_tokens and end - 1 not in seen_tokens: - if match_id in self._ent_ids: - label, ent_id = self._ent_ids[match_id] - span = Span(doc, start, end, label=label, span_id=ent_id) - else: - span = Span(doc, start, end, label=match_id) - new_entities.append(span) - entities = [ - e for e in entities if not (e.start < end and e.end > start) - ] - seen_tokens.update(range(start, end)) - doc.ents = entities + new_entities - - @property - def labels(self) -> Tuple[str, ...]: - """All labels present in the match patterns. - - RETURNS (set): The string labels. 
- - DOCS: https://spacy.io/api/entityruler#labels - """ - keys = set(self.token_patterns.keys()) - keys.update(self.phrase_patterns.keys()) - all_labels = set() - - for l in keys: - if self.ent_id_sep in l: - label, _ = self._split_label(l) - all_labels.add(label) - else: - all_labels.add(l) - return tuple(sorted(all_labels)) - - def initialize( - self, - get_examples: Callable[[], Iterable[Example]], - *, - nlp: Optional[Language] = None, - patterns: Optional[Sequence[PatternType]] = None, - ): - """Initialize the pipe for training. - - get_examples (Callable[[], Iterable[Example]]): Function that - returns a representative sample of gold-standard Example objects. - nlp (Language): The current nlp object the component is part of. - patterns Optional[Iterable[PatternType]]: The list of patterns. - - DOCS: https://spacy.io/api/entityruler#initialize - """ - self.clear() - if patterns: - self.add_patterns(patterns) # type: ignore[arg-type] - - @property - def ent_ids(self) -> Tuple[Optional[str], ...]: - """All entity ids present in the match patterns `id` properties - - RETURNS (set): The string entity ids. - - DOCS: https://spacy.io/api/entityruler#ent_ids - """ - keys = set(self.token_patterns.keys()) - keys.update(self.phrase_patterns.keys()) - all_ent_ids = set() - - for l in keys: - if self.ent_id_sep in l: - _, ent_id = self._split_label(l) - all_ent_ids.add(ent_id) - return tuple(all_ent_ids) - - @property - def patterns(self) -> List[PatternType]: - """Get all patterns that were added to the entity ruler. - - RETURNS (list): The original patterns, one dictionary per pattern. 
- - DOCS: https://spacy.io/api/entityruler#patterns - """ - all_patterns = [] - for label, patterns in self.token_patterns.items(): - for pattern in patterns: - ent_label, ent_id = self._split_label(label) - p = {"label": ent_label, "pattern": pattern} - if ent_id: - p["id"] = ent_id - all_patterns.append(p) - for label, patterns in self.phrase_patterns.items(): - for pattern in patterns: - ent_label, ent_id = self._split_label(label) - p = {"label": ent_label, "pattern": pattern.text} - if ent_id: - p["id"] = ent_id - all_patterns.append(p) - return all_patterns - - def add_patterns(self, patterns: List[PatternType]) -> None: - """Add patterns to the entity ruler. A pattern can either be a token - pattern (list of dicts) or a phrase pattern (string). For example: - {'label': 'ORG', 'pattern': 'Apple'} - {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} - - patterns (list): The patterns to add. - - DOCS: https://spacy.io/api/entityruler#add_patterns - """ - - # disable the nlp components after this one in case they hadn't been initialized / deserialised yet - try: - current_index = -1 - for i, (name, pipe) in enumerate(self.nlp.pipeline): - if self == pipe: - current_index = i - break - subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]] - except ValueError: - subsequent_pipes = [] - with self.nlp.select_pipes(disable=subsequent_pipes): - token_patterns = [] - phrase_pattern_labels = [] - phrase_pattern_texts = [] - phrase_pattern_ids = [] - for entry in patterns: - if isinstance(entry["pattern"], str): - phrase_pattern_labels.append(entry["label"]) - phrase_pattern_texts.append(entry["pattern"]) - phrase_pattern_ids.append(entry.get("id")) - elif isinstance(entry["pattern"], list): - token_patterns.append(entry) - phrase_patterns = [] - for label, pattern, ent_id in zip( - phrase_pattern_labels, - self.nlp.pipe(phrase_pattern_texts), - phrase_pattern_ids, - ): - phrase_pattern = {"label": label, "pattern": pattern} - if 
ent_id: - phrase_pattern["id"] = ent_id - phrase_patterns.append(phrase_pattern) - for entry in token_patterns + phrase_patterns: # type: ignore[operator] - label = entry["label"] # type: ignore - if "id" in entry: - ent_label = label - label = self._create_label(label, entry["id"]) - key = self.matcher._normalize_key(label) - self._ent_ids[key] = (ent_label, entry["id"]) - pattern = entry["pattern"] # type: ignore - if isinstance(pattern, Doc): - self.phrase_patterns[label].append(pattern) - self.phrase_matcher.add(label, [pattern]) # type: ignore - elif isinstance(pattern, list): - self.token_patterns[label].append(pattern) - self.matcher.add(label, [pattern]) - else: - raise ValueError(Errors.E097.format(pattern=pattern)) - - def clear(self) -> None: - """Reset all patterns.""" - self.token_patterns = defaultdict(list) - self.phrase_patterns = defaultdict(list) - self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate - ) - - def remove(self, ent_id: str) -> None: - """Remove a pattern by its ent_id if a pattern with this ent_id was added before - - ent_id (str): id of the pattern to be removed - RETURNS: None - DOCS: https://spacy.io/api/entityruler#remove - """ - label_id_pairs = [ - (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id - ] - if not label_id_pairs: - raise ValueError( - Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name) - ) - created_labels = [ - self._create_label(label, eid) for (label, eid) in label_id_pairs - ] - # remove the patterns from self.phrase_patterns - self.phrase_patterns = defaultdict( - list, - { - label: val - for (label, val) in self.phrase_patterns.items() - if label not in created_labels - }, - ) - # remove the patterns from self.token_pattern - self.token_patterns = defaultdict( - list, - { - label: val - for (label, val) in 
self.token_patterns.items() - if label not in created_labels - }, - ) - # remove the patterns from self.token_pattern - for label in created_labels: - if label in self.phrase_matcher: - self.phrase_matcher.remove(label) - else: - self.matcher.remove(label) - - def _require_patterns(self) -> None: - """Raise a warning if this component has no patterns defined.""" - if len(self) == 0: - warnings.warn(Warnings.W036.format(name=self.name)) - - def _split_label(self, label: str) -> Tuple[str, Optional[str]]: - """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep - - label (str): The value of label in a pattern entry - RETURNS (tuple): ent_label, ent_id - """ - if self.ent_id_sep in label: - ent_label, ent_id = label.rsplit(self.ent_id_sep, 1) - else: - ent_label = label - ent_id = None # type: ignore - return ent_label, ent_id - - def _create_label(self, label: Any, ent_id: Any) -> str: - """Join Entity label with ent_id if the pattern has an `id` attribute - If ent_id is not a string, the label is returned as is. - - label (str): The label to set for ent.label_ - ent_id (str): The label - RETURNS (str): The ent_label joined with configured `ent_id_sep` - """ - if isinstance(ent_id, str): - label = f"{label}{self.ent_id_sep}{ent_id}" - return label - - def from_bytes( - self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityRuler": - """Load the entity ruler from a bytestring. - - patterns_bytes (bytes): The bytestring to load. - RETURNS (EntityRuler): The loaded entity ruler. 
- - DOCS: https://spacy.io/api/entityruler#from_bytes - """ - cfg = srsly.msgpack_loads(patterns_bytes) - self.clear() - if isinstance(cfg, dict): - self.add_patterns(cfg.get("patterns", cfg)) - self.overwrite = cfg.get("overwrite", False) - self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) - self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - else: - self.add_patterns(cfg) - return self - - def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: - """Serialize the entity ruler patterns to a bytestring. - - RETURNS (bytes): The serialized patterns. - - DOCS: https://spacy.io/api/entityruler#to_bytes - """ - serial = { - "overwrite": self.overwrite, - "ent_id_sep": self.ent_id_sep, - "phrase_matcher_attr": self.phrase_matcher_attr, - "patterns": self.patterns, - } - return srsly.msgpack_dumps(serial) - - def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityRuler": - """Load the entity ruler from a file. Expects a file containing - newline-delimited JSON (JSONL) with one entry per line. - - path (str / Path): The JSONL file to load. - RETURNS (EntityRuler): The loaded entity ruler. 
- - DOCS: https://spacy.io/api/entityruler#from_disk - """ - path = ensure_path(path) - self.clear() - depr_patterns_path = path.with_suffix(".jsonl") - if path.suffix == ".jsonl": # user provides a jsonl - if path.is_file: - patterns = srsly.read_jsonl(path) - self.add_patterns(patterns) - else: - raise ValueError(Errors.E1023.format(path=path)) - elif depr_patterns_path.is_file(): - patterns = srsly.read_jsonl(depr_patterns_path) - self.add_patterns(patterns) - elif path.is_dir(): # path is a valid directory - cfg = {} - deserializers_patterns = { - "patterns": lambda p: self.add_patterns( - srsly.read_jsonl(p.with_suffix(".jsonl")) - ) - } - deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))} - from_disk(path, deserializers_cfg, {}) - self.overwrite = cfg.get("overwrite", False) - self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") - self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) - from_disk(path, deserializers_patterns, {}) - else: # path is not a valid directory or file - raise ValueError(Errors.E146.format(path=path)) - return self - - def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> None: - """Save the entity ruler patterns to a directory. The patterns will be - saved as newline-delimited JSON (JSONL). - - path (str / Path): The JSONL file to save. 
- - DOCS: https://spacy.io/api/entityruler#to_disk - """ - path = ensure_path(path) - cfg = { - "overwrite": self.overwrite, - "phrase_matcher_attr": self.phrase_matcher_attr, - "ent_id_sep": self.ent_id_sep, - } - serializers = { - "patterns": lambda p: srsly.write_jsonl( - p.with_suffix(".jsonl"), self.patterns - ), - "cfg": lambda p: srsly.write_json(p, cfg), - } - if path.suffix == ".jsonl": # user wants to save only JSONL - srsly.write_jsonl(path, self.patterns) - else: - to_disk(path, serializers, {}) diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 807a4ffe5..e39b89073 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -11,7 +11,7 @@ from ..language import Language from ..errors import Errors, Warnings from ..util import ensure_path, SimpleFrozenList, registry from ..tokens import Doc, Span -from ..scorer import Scorer +from ..scorer import Scorer, get_ner_prf from ..matcher import Matcher, PhraseMatcher from .. import util @@ -20,7 +20,7 @@ DEFAULT_SPANS_KEY = "ruler" @Language.factory( - "future_entity_ruler", + "entity_ruler", assigns=["doc.ents"], default_config={ "phrase_matcher_attr": None, @@ -63,6 +63,15 @@ def make_entity_ruler( ) +def entity_ruler_score(examples, **kwargs): + return get_ner_prf(examples) + + +@registry.scorers("spacy.entity_ruler_scorer.v1") +def make_entity_ruler_scorer(): + return entity_ruler_score + + @Language.factory( "span_ruler", assigns=["doc.spans"], @@ -117,7 +126,7 @@ def prioritize_new_ents_filter( ) -> List[Span]: """Merge entities and spans into one list without overlaps by allowing spans to overwrite any entities that they overlap with. Intended to - replicate the overwrite_ents=True behavior from the EntityRuler. + replicate the overwrite_ents=True behavior from the v3 EntityRuler. entities (Iterable[Span]): The entities, already filtered for overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps. 
@@ -148,7 +157,7 @@ def prioritize_existing_ents_filter( ) -> List[Span]: """Merge entities and spans into one list without overlaps by prioritizing existing entities. Intended to replicate the overwrite_ents=False behavior - from the EntityRuler. + from the v3 EntityRuler. entities (Iterable[Span]): The entities, already filtered for overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps. diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index b462b1878..20d0febb8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -87,14 +87,15 @@ def test_issue4373(): @pytest.mark.issue(4651) def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is + """Test that the entity_ruler PhraseMatcher is deserialized correctly using + the method from_disk when the entity_ruler argument phrase_matcher_attr is specified. 
""" text = "Spacy is a python library for nlp" nlp = English() patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + config = {"phrase_matcher_attr": "LOWER"} + ruler = nlp.add_pipe("entity_ruler", config=config) ruler.add_patterns(patterns) doc = nlp(text) res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] @@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr(): with make_tempdir() as d: file_path = d / "entityruler" ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) + nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path) doc_reloaded = nlp_reloaded(text) res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] assert res == res_reloaded diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 6851e2a7c..440849e84 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -4,7 +4,7 @@ from spacy import registry from spacy.tokens import Doc, Span from spacy.language import Language from spacy.lang.en import English -from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities +from spacy.pipeline import EntityRecognizer, merge_entities from spacy.pipeline import SpanRuler from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.errors import MatchPatternError @@ -12,8 +12,6 @@ from spacy.tests.util import make_tempdir from thinc.api import NumpyOps, get_current_ops -ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"] - @pytest.fixture def nlp(): @@ -40,13 +38,12 @@ def add_ent_component(doc): @pytest.mark.issue(3345) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue3345(entity_ruler_factory): +def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" nlp = English() doc = Doc(nlp.vocab, 
words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns([{"label": "GPE", "pattern": "New York"}]) cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] @@ -65,15 +62,14 @@ def test_issue3345(entity_ruler_factory): @pytest.mark.issue(4849) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue4849(entity_ruler_factory): +def test_issue4849(): nlp = English() patterns = [ {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ] ruler = nlp.add_pipe( - entity_ruler_factory, + "entity_ruler", name="entity_ruler", config={"phrase_matcher_attr": "LOWER"}, ) @@ -96,11 +92,10 @@ def test_issue4849(entity_ruler_factory): @pytest.mark.issue(5918) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue5918(entity_ruler_factory): +def test_issue5918(): # Test edge case when merging entities. 
nlp = English() - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "ORG", "pattern": "Digicon Inc"}, {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, @@ -125,10 +120,9 @@ def test_issue5918(entity_ruler_factory): @pytest.mark.issue(8168) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue8168(entity_ruler_factory): +def test_issue8168(): nlp = English() - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "ORG", "pattern": "Apple"}, { @@ -148,12 +142,9 @@ def test_issue8168(entity_ruler_factory): @pytest.mark.issue(8216) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_fix8216(nlp, patterns): """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"validate": True} - ) + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) ruler.add_patterns(patterns) pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) assert pattern_count > 0 @@ -162,16 +153,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory): assert after_count == pattern_count -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_init(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 assert "HELLO" in ruler assert "BYE" in ruler nlp.remove_pipe("entity_ruler") - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) doc = nlp("hello world bye bye") assert 
len(doc.ents) == 2 @@ -179,23 +169,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_no_patterns_warns(nlp): + ruler = nlp.add_pipe("entity_ruler") assert len(ruler) == 0 assert len(ruler.labels) == 0 nlp.remove_pipe("entity_ruler") - nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + nlp.add_pipe("entity_ruler") assert nlp.pipe_names == ["entity_ruler"] with pytest.warns(UserWarning): doc = nlp("hello world bye bye") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") assert len(ruler.labels) == 0 ruler.initialize(lambda: [], patterns=patterns) assert len(ruler.labels) == 4 @@ -207,7 +195,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): nlp.config["initialize"]["components"]["entity_ruler"] = { "patterns": {"@misc": "entity_ruler_patterns"} } - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") assert len(ruler.labels) == 0 nlp.initialize() assert len(ruler.labels) == 4 @@ -216,20 +204,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_init_clear(nlp, patterns): """Test that initialization clears patterns.""" - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = 
nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 ruler.initialize(lambda: []) assert len(ruler.labels) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_clear(nlp, patterns): """Test that initialization clears patterns.""" - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 doc = nlp("hello world") @@ -241,9 +227,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory): assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_existing(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("OH HELLO WORLD bye bye") @@ -252,11 +237,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_existing_overwrite(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("OH HELLO WORLD bye bye") @@ -266,11 +248,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - 
entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_existing_complex(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("foo foo bye bye") @@ -281,11 +260,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory): assert len(doc.ents[1]) == 2 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_entity_id(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) doc = nlp("Apple is a technology company") assert len(doc.ents) == 1 @@ -293,26 +269,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory): assert doc.ents[0].ent_id_ == "a1" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_cfg_ent_id_sep(nlp, patterns): config = {"overwrite_ents": True, "ent_id_sep": "**"} - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config) + ruler = nlp.add_pipe("entity_ruler", config=config) ruler.add_patterns(patterns) doc = nlp("Apple is a technology company") - if isinstance(ruler, EntityRuler): - assert "TECH_ORG**a1" in ruler.phrase_patterns assert len(doc.ents) == 1 assert doc.ents[0].label_ == "TECH_ORG" assert doc.ents[0].ent_id_ == "a1" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory): - ruler = EntityRuler(nlp, patterns=patterns) +def test_entity_ruler_serialize_bytes(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) assert len(ruler) 
== len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler") assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 new_ruler = new_ruler.from_bytes(ruler_bytes) @@ -324,28 +297,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory): assert sorted(new_ruler.labels) == sorted(ruler.labels) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_phrase_matcher_attr_bytes( - nlp, patterns, entity_ruler_factory -): - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns) +def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe( + "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"} + ) assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 - assert new_ruler.phrase_matcher_attr is None new_ruler = new_ruler.from_bytes(ruler_bytes) assert len(new_ruler) == len(patterns) assert len(new_ruler.labels) == 4 - assert new_ruler.phrase_matcher_attr == "LOWER" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_validate(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") - validated_ruler = EntityRuler(nlp, validate=True) +def test_entity_ruler_validate(nlp): + ruler = nlp.add_pipe("entity_ruler") + validated_ruler = nlp.add_pipe( + "entity_ruler", name="validated_ruler", config={"validate": True} + ) valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]} invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]} @@ -362,16 +334,15 @@ def test_entity_ruler_validate(nlp, 
entity_ruler_factory): validated_ruler.add_patterns([invalid_pattern]) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory): - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) +def test_entity_ruler_properties(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) + ruler.add_patterns(patterns) assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) - assert sorted(ruler.ent_ids) == ["a1", "a2"] + assert sorted(ruler.ids) == ["a1", "a2"] -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_overlapping_spans(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "FOOBAR", "pattern": "foo bar"}, {"label": "BARBAZ", "pattern": "bar baz"}, @@ -383,14 +354,13 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory): @pytest.mark.parametrize("n_process", [1, 2]) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): +def test_entity_ruler_multiprocessing(nlp, n_process): if isinstance(get_current_ops, NumpyOps) or n_process < 2: texts = ["I enjoy eating Pizza Hut pizza."] patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}] - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) for doc in nlp.pipe(texts, n_process=2): @@ -398,9 +368,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): assert ent.ent_id_ == "1234" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, 
name="entity_ruler") +def test_entity_ruler_serialize_jsonl(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: ruler.to_disk(d / "test_ruler.jsonl") @@ -409,9 +378,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory): ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_serialize_dir(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: ruler.to_disk(d / "test_ruler") @@ -420,9 +388,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory): ruler.from_disk(d / "non_existing_dir") # read from a bad directory -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_basic(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_basic(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -432,24 +399,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory): doc = nlp("Dina went to school") assert len(ruler.patterns) == 3 assert len(doc.ents) == 1 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" in ruler.phrase_matcher assert doc.ents[0].label_ == "PERSON" assert doc.ents[0].text == "Dina" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina went to school") assert len(doc.ents) == 0 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" not in ruler.phrase_matcher assert len(ruler.patterns) == 2 
-@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_same_id_multiple_patterns(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "DinaCorp", "id": "dina"}, @@ -458,25 +417,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory ruler.add_patterns(patterns) doc = nlp("Dina founded DinaCorp and ACME.") assert len(ruler.patterns) == 3 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" in ruler.phrase_matcher - assert "ORG||dina" in ruler.phrase_matcher assert len(doc.ents) == 3 - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina founded DinaCorp and ACME.") assert len(ruler.patterns) == 1 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" not in ruler.phrase_matcher - assert "ORG||dina" not in ruler.phrase_matcher assert len(doc.ents) == 1 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_nonexisting_pattern(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -491,9 +440,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory): ruler.remove_by_id("nepattern") -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_several_patterns(nlp): + ruler = 
nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -507,27 +455,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory): assert doc.ents[0].text == "Dina" assert doc.ents[1].label_ == "ORG" assert doc.ents[1].text == "ACME" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina founded her company ACME") assert len(ruler.patterns) == 2 assert len(doc.ents) == 1 assert doc.ents[0].label_ == "ORG" assert doc.ents[0].text == "ACME" - if isinstance(ruler, EntityRuler): - ruler.remove("acme") - else: - ruler.remove_by_id("acme") + ruler.remove_by_id("acme") doc = nlp("Dina founded her company ACME") assert len(ruler.patterns) == 1 assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_patterns_in_a_row(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -543,21 +484,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory): assert doc.ents[1].text == "ACME" assert doc.ents[2].label_ == "DATE" assert doc.ents[2].text == "her birthday" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - ruler.remove("acme") - ruler.remove("bday") - else: - ruler.remove_by_id("dina") - ruler.remove_by_id("acme") - ruler.remove_by_id("bday") + ruler.remove_by_id("dina") + ruler.remove_by_id("acme") + ruler.remove_by_id("bday") doc = nlp("Dina went to school") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory): - ruler = 
nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_all_patterns(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -565,29 +500,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory): ] ruler.add_patterns(patterns) assert len(ruler.patterns) == 3 - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") assert len(ruler.patterns) == 2 - if isinstance(ruler, EntityRuler): - ruler.remove("acme") - else: - ruler.remove_by_id("acme") + ruler.remove_by_id("acme") assert len(ruler.patterns) == 1 - if isinstance(ruler, EntityRuler): - ruler.remove("bday") - else: - ruler.remove_by_id("bday") + ruler.remove_by_id("bday") assert len(ruler.patterns) == 0 with pytest.warns(UserWarning): doc = nlp("Dina founded her company ACME on her birthday") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_and_add(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [{"label": "DATE", "pattern": "last time"}] ruler.add_patterns(patterns) doc = ruler( @@ -608,10 +533,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): assert doc.ents[0].text == "last time" assert doc.ents[1].label_ == "DATE" assert doc.ents[1].text == "this time" - if isinstance(ruler, EntityRuler): - ruler.remove("ttime") - else: - ruler.remove_by_id("ttime") + ruler.remove_by_id("ttime") doc = ruler( nlp.make_doc("I saw him last time we met, this time he brought some flowers") ) @@ -634,10 +556,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): ) assert len(ruler.patterns) == 3 assert len(doc.ents) == 3 - if isinstance(ruler, EntityRuler): - 
ruler.remove("ttime") - else: - ruler.remove_by_id("ttime") + ruler.remove_by_id("ttime") doc = ruler( nlp.make_doc( "I saw him last time we met, this time he brought some flowers, another time some chocolate." diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b948bb76c..e49882441 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -8,7 +8,7 @@ import spacy from spacy import Vocab, load, registry from spacy.lang.en import English from spacy.language import Language -from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler +from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer from spacy.pipeline import TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL @@ -85,58 +85,17 @@ def test_issue_3526_1(en_vocab): {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, ] nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) + ruler.add_patterns(patterns) ruler_bytes = ruler.to_bytes() assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe( + "entity_ruler", name="new_ruler", config={"overwrite_ents": True} + ) new_ruler = new_ruler.from_bytes(ruler_bytes) assert len(new_ruler) == len(ruler) assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -@pytest.mark.issue(3526) -def test_issue_3526_2(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", 
"OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -@pytest.mark.issue(3526) -def test_issue_3526_3(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite @pytest.mark.issue(3526) @@ -150,16 +109,14 @@ def test_issue_3526_4(en_vocab): nlp.to_disk(tmpdir) ruler = nlp.get_pipe("entity_ruler") assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True nlp2 = load(tmpdir) new_ruler = nlp2.get_pipe("entity_ruler") assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True @pytest.mark.issue(4042) def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" + """Test that serialization of an entity_ruler before NER works fine.""" nlp = English() # add ner pipe ner = nlp.add_pipe("ner") diff --git a/website/docs/api/entityruler.md 
b/website/docs/api/entityruler.md index ef7acbbf1..651c87585 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -1,13 +1,24 @@ --- title: EntityRuler -tag: class -source: spacy/pipeline/entity_ruler.py new: 2.1 teaser: 'Pipeline component for rule-based named entity recognition' api_string_name: entity_ruler api_trainable: false --- + + +As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is +implemented as a special case of the `SpanRuler` component. + +See the [migration guide](#migrating) below for differences between the v3 +`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler` +component. + +See the [`SpanRuler`](/api/spanruler) API docs for the full API. + + + The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using token-based rules or exact phrase matches. It can be combined with the statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or @@ -63,271 +74,51 @@ how the component should be configured. You can override its settings via the | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | -```python -%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py +## Migrating from v3 {#migrating} + +### Loading patterns + +Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on +initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file +path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the +JSONL file separately and then added through +[`SpanRuler.initialize`](/api/spanruler#initialize) or +[`SpanRuler.add_patterns`](/api/spanruler#add_patterns). 
+ +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.from_disk("patterns.jsonl") ++ import srsly ++ patterns = srsly.read_jsonl("patterns.jsonl") ++ ruler.add_patterns(patterns) ``` -## EntityRuler.\_\_init\_\_ {#init tag="method"} +### Saving patterns -Initialize the entity ruler. If patterns are supplied here, they need to be a -list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either -be a token pattern (list) or a phrase pattern (string). For example: -`{"label": "ORG", "pattern": "Apple"}`. +`SpanRuler.to_disk` always saves the full component data to a directory and does +not include an option to save the patterns to a single JSONL file. -> #### Example -> -> ```python -> # Construction via add_pipe -> ruler = nlp.add_pipe("entity_ruler") -> -> # Construction from class -> from spacy.pipeline import EntityRuler -> ruler = EntityRuler(nlp, overwrite_ents=True) -> ``` +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.to_disk("patterns.jsonl") ++ import srsly ++ srsly.write_jsonl("patterns.jsonl", ruler.patterns) +``` -| Name | Description | -| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | -| `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | -| _keyword-only_ | | -| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. 
~~Optional[Union[int, str]]~~ | -| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | -| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | -| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | +### Accessing token and phrase patterns -## EntityRuler.initialize {#initialize tag="method" new="3"} +The separate token patterns and phrase patterns are no longer accessible under +`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined +patterns in their original format using the property +[`SpanRuler.patterns`](/api/spanruler#patterns). -Initialize the component with data and used before training to load in rules -from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method -is typically called by [`Language.initialize`](/api/language#initialize) and -lets you customize arguments it receives via the -[`[initialize.components]`](/api/data-formats#config-initialize) block in the -config. +### Removing patterns by ID -> #### Example -> -> ```python -> entity_ruler = nlp.add_pipe("entity_ruler") -> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns) -> ``` -> -> ```ini -> ### config.cfg -> [initialize.components.entity_ruler] -> -> [initialize.components.entity_ruler.patterns] -> @readers = "srsly.read_jsonl.v1" -> path = "corpus/entity_ruler_patterns.jsonl -> ``` +[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. 
To +remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id): -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ | - -## EntityRuler.\_\len\_\_ {#len tag="method"} - -The number of all patterns added to the entity ruler. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> assert len(ruler) == 0 -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> assert len(ruler) == 1 -> ``` - -| Name | Description | -| ----------- | ------------------------------- | -| **RETURNS** | The number of patterns. ~~int~~ | - -## EntityRuler.\_\_contains\_\_ {#contains tag="method"} - -Whether a label is present in the patterns. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> assert "ORG" in ruler -> assert not "PERSON" in ruler -> ``` - -| Name | Description | -| ----------- | ----------------------------------------------------- | -| `label` | The label to check. ~~str~~ | -| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ | - -## EntityRuler.\_\_call\_\_ {#call tag="method"} - -Find matches in the `Doc` and add them to the `doc.ents`. Typically, this -happens automatically after the component has been added to the pipeline using -[`nlp.add_pipe`](/api/language#add_pipe). 
If the entity ruler was initialized -with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. When matches overlap in a Doc, the entity ruler prioritizes -longer patterns over shorter, and if equal the match occuring first in the Doc -is chosen. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> -> doc = nlp("A text about Apple.") -> ents = [(ent.text, ent.label_) for ent in doc.ents] -> assert ents == [("Apple", "ORG")] -> ``` - -| Name | Description | -| ----------- | -------------------------------------------------------------------- | -| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | -| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ | - -## EntityRuler.add_patterns {#add_patterns tag="method"} - -Add patterns to the entity ruler. A pattern can either be a token pattern (list -of dicts) or a phrase pattern (string). For more details, see the usage guide on -[rule-based matching](/usage/rule-based-matching). - -> #### Example -> -> ```python -> patterns = [ -> {"label": "ORG", "pattern": "Apple"}, -> {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]} -> ] -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns(patterns) -> ``` - -| Name | Description | -| ---------- | ---------------------------------------------------------------- | -| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | - - -## EntityRuler.remove {#remove tag="method" new="3.2.1"} - -Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist. 
- -> #### Example -> -> ```python -> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}] -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns(patterns) -> ruler.remove("apple") -> ``` - -| Name | Description | -| ---------- | ---------------------------------------------------------------- | -| `id` | The ID of the pattern rule. ~~str~~ | - -## EntityRuler.to_disk {#to_disk tag="method"} - -Save the entity ruler patterns to a directory. The patterns will be saved as -newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided, -only the patterns are saved as JSONL. If a directory name is provided, a -`patterns.jsonl` and `cfg` file with the component configuration is exported. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only -> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config -> ``` - -| Name | Description | -| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | - -## EntityRuler.from_disk {#from_disk tag="method"} - -Load the entity ruler from a path. Expects either a file containing -newline-delimited JSON (JSONL) with one entry per line, or a directory -containing a `patterns.jsonl` file and a `cfg` file with the component -configuration. 
- -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only -> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------- | -| `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | -| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | - -## EntityRuler.to_bytes {#to_bytes tag="method"} - -Serialize the entity ruler patterns to a bytestring. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler_bytes = ruler.to_bytes() -> ``` - -| Name | Description | -| ----------- | ---------------------------------- | -| **RETURNS** | The serialized patterns. ~~bytes~~ | - -## EntityRuler.from_bytes {#from_bytes tag="method"} - -Load the pipe from a bytestring. Modifies the object in place and returns it. - -> #### Example -> -> ```python -> ruler_bytes = ruler.to_bytes() -> ruler = nlp.add_pipe("entity_ruler") -> ruler.from_bytes(ruler_bytes) -> ``` - -| Name | Description | -| ------------ | -------------------------------------------------- | -| `bytes_data` | The bytestring to load. ~~bytes~~ | -| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | - -## EntityRuler.labels {#labels tag="property"} - -All labels present in the match patterns. - -| Name | Description | -| ----------- | -------------------------------------- | -| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ | - -## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"} - -All entity IDs present in the `id` properties of the match patterns. - -| Name | Description | -| ----------- | ----------------------------------- | -| **RETURNS** | The string IDs. 
~~Tuple[str, ...]~~ | - -## EntityRuler.patterns {#patterns tag="property"} - -Get all patterns that were added to the entity ruler. - -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------- | -| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ | - -## Attributes {#attributes} - -| Name | Description | -| ----------------- | --------------------------------------------------------------------------------------------------------------------- | -| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ | -| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ | -| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ | -| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ | +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.remove("id") ++ ruler.remove_by_id("id") +``` diff --git a/website/docs/api/spanruler.md b/website/docs/api/spanruler.md index b573f7c58..1339d0967 100644 --- a/website/docs/api/spanruler.md +++ b/website/docs/api/spanruler.md @@ -13,6 +13,17 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or usage examples, see the docs on [rule-based span matching](/usage/rule-based-matching#spanruler). + + +As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is +implemented as a special case of the `SpanRuler` component. + +See the [migration guide](/api/entityruler#migrating) for differences between +the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler` +component. 
+ + + ## Assigned Attributes {#assigned-attributes} Matches will be saved to `Doc.spans[spans_key]` as a diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 4ebca2756..ecc7f2fd9 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding ![The processing pipeline](../../images/pipeline.svg) -| Name | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------- | -| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | -| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | -| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. | -| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | -| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | -| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | -| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. | -| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | -| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | -| [`Tagger`](/api/tagger) | Predict part-of-speech tags. | -| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. | -| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | -| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. 
| -| [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. | -| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | -| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +| Component name | Component class | Description | +| ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | +| `entity_ruler` | [`SpanRuler`](/api/spanruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | +| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | +| `span_ruler` | [`SpanRuler`](/api/spanruler) | Add spans to the `Doc` using token-based rules or exact phrase matches. | +| `tagger` | [`Tagger`](/api/tagger) | Predict part-of-speech tags. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Predict exactly one category or label over a whole document. 
| +| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | +| `tokenizer` | [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | +| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. | +| `transformer` | [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | +| - | [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. | +| - | [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | ### Matchers {#architecture-matchers} diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md index f43219f41..3a6d67a37 100644 --- a/website/docs/usage/101/_pipelines.md +++ b/website/docs/usage/101/_pipelines.md @@ -53,9 +53,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll only work if it's added after the tagger. The parser will respect pre-defined sentence boundaries, so if a previous component in the pipeline sets them, its dependency predictions may be different. Similarly, it matters if you add the -[`EntityRuler`](/api/entityruler) before or after the statistical entity -recognizer: if it's added before, the entity recognizer will take the existing -entities into account when making predictions. The +[`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer: +if it's added before and it is writing to `doc.ents`, then the entity recognizer +will take those existing entities into account when making predictions. 
The [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge base IDs, should be preceded by a pipeline component that recognizes entities such as the [`EntityRecognizer`](/api/entityrecognizer). diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index bd28810ae..2463b523f 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -303,13 +303,14 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | +| Component name | Component class | Description | | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- | | `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | | `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | | `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `span_ruler` | [`SpanRuler`](/api/spanruler) | Assign spans based on pattern rules and dictionaries. | +| `entity_ruler` | [`SpanRuler`](/api/spanruler) | Assign named entities based on pattern rules and dictionaries. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories: exactly one category is predicted per document. | | `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document. | | `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words using rules and lookups. 
| diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index bf1891df1..d9f551820 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -375,7 +375,7 @@ scoped quantifiers – instead, you can build those behaviors with `on_match` callbacks. | OP | Description | -|---------|------------------------------------------------------------------------| +| ------- | ---------------------------------------------------------------------- | | `!` | Negate the pattern, by requiring it to match exactly 0 times. | | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | | `+` | Require the pattern to match 1 or more times. | @@ -471,7 +471,7 @@ matches = matcher(doc) ``` A very similar logic has been implemented in the built-in -[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling +[`entity_ruler`](/api/entityruler) by the way. It also takes care of handling overlapping matches, which you would otherwise have to take care of yourself. > #### Tip: Visualizing matches @@ -1270,7 +1270,7 @@ of patterns such as `{}` that match any token in the sentence. ## Rule-based entity recognition {#entityruler new="2.1"} -The [`EntityRuler`](/api/entityruler) is a component that lets you add named +The [`entity_ruler`](/api/entityruler) is a component that lets you add named entities based on pattern dictionaries, which makes it easy to combine rule-based and statistical named entity recognition for even more powerful pipelines. @@ -1295,13 +1295,12 @@ pattern. The entity ruler accepts two types of patterns: ### Using the entity ruler {#entityruler-usage} -The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically -added via [`nlp.add_pipe`](/api/language#add_pipe). 
When the `nlp` object is -called on a text, it will find matches in the `doc` and add them as entities to -the `doc.ents`, using the specified pattern label as the entity label. If any -matches were to overlap, the pattern matching most tokens takes priority. If -they also happen to be equally long, then the match occurring first in the `Doc` -is chosen. +The `entity_ruler` is a pipeline component that's typically added via +[`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a +text, it will find matches in the `doc` and add them as entities to `doc.ents`, +using the specified pattern label as the entity label. If any matches were to +overlap, the pattern matching most tokens takes priority. If they also happen to +be equally long, then the match occurring first in the `Doc` is chosen. ```python ### {executable="true"} @@ -1339,7 +1338,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -#### Validating and debugging EntityRuler patterns {#entityruler-pattern-validation new="2.1.8"} +#### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"} The entity ruler can validate patterns against a JSON schema with the config setting `"validate"`. See details under @@ -1351,9 +1350,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) ### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"} -The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each -pattern. Using the `id` attribute allows multiple patterns to be associated with -the same entity. +The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for +each pattern. Using the `id` attribute allows multiple patterns to be associated +with the same entity. 
```python ### {executable="true"} @@ -1373,10 +1372,10 @@ doc2 = nlp("Apple is opening its first big office in San Fran.") print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents]) ``` -If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) -patterns, the `id_` property of the matched entity is set to the `id` given -in the patterns. So in the example above it's easy to identify that "San -Francisco" and "San Fran" are both the same entity. +If the `id` attribute is included in the [`entity_ruler`](/api/entityruler) +patterns, the `id_` property of the matched entity is set to the `id` given in +the patterns. So in the example above it's easy to identify that "San Francisco" +and "San Fran" are both the same entity. ### Using pattern files {#entityruler-files} @@ -1400,13 +1399,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl") If you're using the [Prodigy](https://prodi.gy) annotation tool, you might recognize these pattern files from bootstrapping your named entity and text -classification labelling. The patterns for the `EntityRuler` follow the same +classification labelling. The patterns for the `entity_ruler` follow the same syntax, so you can use your existing Prodigy pattern files in spaCy, and vice versa. -When you save out an `nlp` object that has an `EntityRuler` added to its +When you save out an `nlp` object that has an `entity_ruler` added to its pipeline, its patterns are automatically exported to the pipeline directory: ```python @@ -1429,9 +1428,9 @@ rules included! When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the entity ruler works. For -each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc -object. 
This happens in case you try to add the EntityRuler at the end of an -existing pipeline with, for example, a POS tagger and want to extract matches +each **phrase pattern**, the entity ruler calls the nlp object to construct a +doc object. This happens in case you try to add the entity ruler at the end of +an existing pipeline with, for example, a POS tagger and want to extract matches based on the pattern's POS signature. In this case you would pass a config value of `"phrase_matcher_attr": "POS"` for the entity ruler. diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 9a4b584a3..d2b67b199 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -193,13 +193,13 @@ the data to and from a JSON file. > #### Real-world example > -> To see custom serialization methods in action, check out the new -> [`EntityRuler`](/api/entityruler) component and its -> [source](%%GITHUB_SPACY/spacy/pipeline/entity_ruler.py). Patterns added to the +> To see custom serialization methods in action, check out the +> [`SpanRuler`](/api/spanruler) component and its +> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the > component will be saved to a `.jsonl` file if the pipeline is serialized to > disk, and to a bytestring if the pipeline is serialized to bytes. This allows -> saving out a pipeline with a rule-based entity recognizer and including all -> rules _with_ the component data. +> saving out a pipeline with rule-based components _with_ all the component +> data. ```python ### {highlight="16-23,25-30"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 27a8bbca7..5ee148224 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -424,7 +424,7 @@ your components during training, and the most common scenarios are: 2. Update an existing **trained component** with more examples. 3. 
Include an existing trained component without updating it. 4. Include a non-trainable component, like a rule-based - [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a + [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a fully [custom component](/usage/processing-pipelines#custom-components). If a component block defines a `factory`, spaCy will look it up in the