Entity ruler remove pattern (#9685)

* added ruler coe

* added error for none existing pattern

* changed error to warning

* changed error to warning

* added basic tests

* fixed place

* added test files

* went back to error

* went back to pattern error

* minor change to docs

* changed style

* changed doc

* changed error slightly

* added remove to phrasem api

* error key already existed

* phrase matcher match code to api

* blacked tests

* moved comments before expr

* corrected error no

* Update website/docs/api/entityruler.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/entityruler.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Duygu Altinok 2021-12-06 15:32:49 +01:00 committed by GitHub
parent 472740d613
commit b56b9e7f31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 252 additions and 6 deletions

View File

@ -889,6 +889,8 @@ class Errors(metaclass=ErrorsWithCodes):
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -8,12 +8,9 @@ class PhraseMatcher:
def __init__(
self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
) -> None: ...
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...
def add(
self,
key: str,
@ -23,3 +20,10 @@ class PhraseMatcher:
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...

View File

@ -348,6 +348,46 @@ class EntityRuler(Pipe):
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
def remove(self, ent_id: str) -> None:
"""Remove a pattern by its ent_id if a pattern with this ent_id was added before
ent_id (str): id of the pattern to be removed
RETURNS: None
DOCS: https://spacy.io/api/entityruler#remove
"""
label_id_pairs = [
(label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id
]
if not label_id_pairs:
raise ValueError(Errors.E1024.format(ent_id=ent_id))
created_labels = [
self._create_label(label, eid) for (label, eid) in label_id_pairs
]
# remove the patterns from self.phrase_patterns
self.phrase_patterns = defaultdict(
list,
{
label: val
for (label, val) in self.phrase_patterns.items()
if label not in created_labels
},
)
# remove the patterns from self.token_pattern
self.token_patterns = defaultdict(
list,
{
label: val
for (label, val) in self.token_patterns.items()
if label not in created_labels
},
)
# remove the patterns from self.token_pattern
for label in created_labels:
if label in self.phrase_matcher:
self.phrase_matcher.remove(label)
else:
self.matcher.remove(label)
def _require_patterns(self) -> None:
"""Raise a warning if this component has no patterns defined."""
if len(self) == 0:

View File

@ -373,3 +373,185 @@ def test_entity_ruler_serialize_dir(nlp, patterns):
ruler.from_disk(d / "test_ruler") # read from an existing directory
with pytest.raises(ValueError):
ruler.from_disk(d / "non_existing_dir") # read from a bad directory
def test_entity_ruler_remove_basic(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
{"label": "ORG", "pattern": "ACM"},
]
ruler.add_patterns(patterns)
doc = ruler(nlp.make_doc("Duygu went to school"))
assert len(ruler.patterns) == 3
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "PERSON"
assert doc.ents[0].text == "Duygu"
assert "PERSON||duygu" in ruler.phrase_matcher
ruler.remove("duygu")
doc = ruler(nlp.make_doc("Duygu went to school"))
assert len(doc.ents) == 0
assert "PERSON||duygu" not in ruler.phrase_matcher
assert len(ruler.patterns) == 2
def test_entity_ruler_remove_same_id_multiple_patterns(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "DuyguCorp", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
]
ruler.add_patterns(patterns)
doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME."))
assert len(ruler.patterns) == 3
assert "PERSON||duygu" in ruler.phrase_matcher
assert "ORG||duygu" in ruler.phrase_matcher
assert len(doc.ents) == 3
ruler.remove("duygu")
doc = ruler(nlp.make_doc("Duygu founded DuyguCorp and ACME."))
assert len(ruler.patterns) == 1
assert "PERSON||duygu" not in ruler.phrase_matcher
assert "ORG||duygu" not in ruler.phrase_matcher
assert len(doc.ents) == 1
def test_entity_ruler_remove_nonexisting_pattern(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
{"label": "ORG", "pattern": "ACM"},
]
ruler.add_patterns(patterns)
assert len(ruler.patterns) == 3
with pytest.raises(ValueError):
ruler.remove("nepattern")
assert len(ruler.patterns) == 3
def test_entity_ruler_remove_several_patterns(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
{"label": "ORG", "pattern": "ACM"},
]
ruler.add_patterns(patterns)
doc = ruler(nlp.make_doc("Duygu founded her company ACME."))
assert len(ruler.patterns) == 3
assert len(doc.ents) == 2
assert doc.ents[0].label_ == "PERSON"
assert doc.ents[0].text == "Duygu"
assert doc.ents[1].label_ == "ORG"
assert doc.ents[1].text == "ACME"
ruler.remove("duygu")
doc = ruler(nlp.make_doc("Duygu founded her company ACME"))
assert len(ruler.patterns) == 2
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "ORG"
assert doc.ents[0].text == "ACME"
ruler.remove("acme")
doc = ruler(nlp.make_doc("Duygu founded her company ACME"))
assert len(ruler.patterns) == 1
assert len(doc.ents) == 0
def test_entity_ruler_remove_patterns_in_a_row(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
{"label": "DATE", "pattern": "her birthday", "id": "bday"},
{"label": "ORG", "pattern": "ACM"},
]
ruler.add_patterns(patterns)
doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday"))
assert len(doc.ents) == 3
assert doc.ents[0].label_ == "PERSON"
assert doc.ents[0].text == "Duygu"
assert doc.ents[1].label_ == "ORG"
assert doc.ents[1].text == "ACME"
assert doc.ents[2].label_ == "DATE"
assert doc.ents[2].text == "her birthday"
ruler.remove("duygu")
ruler.remove("acme")
ruler.remove("bday")
doc = ruler(nlp.make_doc("Duygu went to school"))
assert len(doc.ents) == 0
def test_entity_ruler_remove_all_patterns(nlp):
ruler = EntityRuler(nlp)
patterns = [
{"label": "PERSON", "pattern": "Duygu", "id": "duygu"},
{"label": "ORG", "pattern": "ACME", "id": "acme"},
{"label": "DATE", "pattern": "her birthday", "id": "bday"},
]
ruler.add_patterns(patterns)
assert len(ruler.patterns) == 3
ruler.remove("duygu")
assert len(ruler.patterns) == 2
ruler.remove("acme")
assert len(ruler.patterns) == 1
ruler.remove("bday")
assert len(ruler.patterns) == 0
with pytest.warns(UserWarning):
doc = ruler(nlp.make_doc("Duygu founded her company ACME on her birthday"))
assert len(doc.ents) == 0
def test_entity_ruler_remove_and_add(nlp):
ruler = EntityRuler(nlp)
patterns = [{"label": "DATE", "pattern": "last time"}]
ruler.add_patterns(patterns)
doc = ruler(
nlp.make_doc("I saw him last time we met, this time he brought some flowers")
)
assert len(ruler.patterns) == 1
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "DATE"
assert doc.ents[0].text == "last time"
patterns1 = [{"label": "DATE", "pattern": "this time", "id": "ttime"}]
ruler.add_patterns(patterns1)
doc = ruler(
nlp.make_doc("I saw him last time we met, this time he brought some flowers")
)
assert len(ruler.patterns) == 2
assert len(doc.ents) == 2
assert doc.ents[0].label_ == "DATE"
assert doc.ents[0].text == "last time"
assert doc.ents[1].label_ == "DATE"
assert doc.ents[1].text == "this time"
ruler.remove("ttime")
doc = ruler(
nlp.make_doc("I saw him last time we met, this time he brought some flowers")
)
assert len(ruler.patterns) == 1
assert len(doc.ents) == 1
assert doc.ents[0].label_ == "DATE"
assert doc.ents[0].text == "last time"
ruler.add_patterns(patterns1)
doc = ruler(
nlp.make_doc("I saw him last time we met, this time he brought some flowers")
)
assert len(ruler.patterns) == 2
assert len(doc.ents) == 2
patterns2 = [{"label": "DATE", "pattern": "another time", "id": "ttime"}]
ruler.add_patterns(patterns2)
doc = ruler(
nlp.make_doc(
"I saw him last time we met, this time he brought some flowers, another time some chocolate."
)
)
assert len(ruler.patterns) == 3
assert len(doc.ents) == 3
ruler.remove("ttime")
doc = ruler(
nlp.make_doc(
"I saw him last time we met, this time he brought some flowers, another time some chocolate."
)
)
assert len(ruler.patterns) == 1
assert len(doc.ents) == 1

View File

@ -210,6 +210,24 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
| ---------- | ---------------------------------------------------------------- |
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
## EntityRuler.remove {#remove tag="method" new="3.2.1"}
Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if the ID does not exist.
> #### Example
>
> ```python
> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}]
> ruler = nlp.add_pipe("entity_ruler")
> ruler.add_patterns(patterns)
> ruler.remove("apple")
> ```
| Name | Description |
| ---------- | ---------------------------------------------------------------- |
| `id` | The ID of the pattern rule. ~~str~~ |
## EntityRuler.to_disk {#to_disk tag="method"}
Save the entity ruler patterns to a directory. The patterns will be saved as