Extend AttributeRuler functionality

* Add option to initialize with a dict of AttributeRuler patterns

* Instead of silently discarding overlapping matches (the default
behavior for the retokenizer if only the attrs differ), split the
matches into disjoint sets and retokenize each set separately. This
allows, for instance, one pattern to set the POS and another pattern to
set the lemma. (If two matches modify the same attribute, the attrs
appear to be applied in the order the patterns were added, but this
ordering may not be deterministic.)

* Improve types
This commit is contained in:
Adriane Boyd 2020-07-30 11:17:33 +02:00
parent 352b918356
commit ca33e891e2
2 changed files with 122 additions and 36 deletions

View File

@@ -1,5 +1,5 @@
import srsly
from typing import List, Dict, Union, Iterable
from typing import List, Dict, Union, Iterable, Any
from pathlib import Path
from .pipe import Pipe
@@ -7,11 +7,14 @@ from ..errors import Errors
from ..language import Language
from ..matcher import Matcher
from ..symbols import IDS
from ..tokens import Doc
from ..tokens import Doc, Span
from ..vocab import Vocab
from .. import util
MatcherPatternType = List[Dict[Union[int, str], Any]]
@Language.factory(
"attribute_ruler",
assigns=[],
@@ -20,9 +23,9 @@ from .. import util
default_score_weights={},
)
def make_attribute_ruler(
nlp: Language, name: str,
nlp: Language, name: str, pattern_dicts: Iterable[Dict] = tuple()
):
return AttributeRuler(nlp.vocab, name)
return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts)
class AttributeRuler(Pipe):
@@ -32,10 +35,22 @@ class AttributeRuler(Pipe):
DOCS: https://spacy.io/api/attributeruler
"""
def __init__(self, vocab: Vocab, name: str = "attribute_ruler") -> None:
"""Initialize the attributeruler.
def __init__(
self,
vocab: Vocab,
name: str = "attribute_ruler",
*,
pattern_dicts: List[Dict[str, Union[List[MatcherPatternType], Dict, int]]] = {},
) -> None:
"""Initialize the AttributeRuler.
RETURNS (AttributeRuler): The attributeruler component.
vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler".
pattern_dicts (List[Dict]): A list of pattern dicts whose keys correspond to
the arguments of AttributeRuler.add (`patterns`/`attrs`/`index`). Each dict
is added as a pattern on initialization.
RETURNS (AttributeRuler): The AttributeRuler component.
DOCS: https://spacy.io/api/attributeruler#init
"""
@@ -45,6 +60,9 @@ class AttributeRuler(Pipe):
self.attrs = []
self.indices = []
for p in pattern_dicts:
self.add(**p)
def __call__(self, doc: Doc) -> Doc:
"""Apply the attributeruler to a Doc and set all attribute exceptions.
@@ -54,21 +72,36 @@ class AttributeRuler(Pipe):
DOCS: https://spacy.io/api/attributeruler#call
"""
matches = self.matcher(doc)
with doc.retokenize() as retokenizer:
for match_id, start, end in matches:
attrs = self.attrs[match_id]
index = self.indices[match_id]
token = doc[start:end][index]
if start <= token.i < end:
retokenizer.merge(doc[token.i : token.i + 1], attrs)
else:
raise ValueError(
Errors.E1001.format(
patterns=self.matcher.get(match_id),
span=[t.text for t in doc[start:end]],
index=index,
# Multiple patterns may apply to the same token but the retokenizer can
# only handle one merge per token, so split the matches into sets of
# disjoint spans.
original_spans = set(
[Span(doc, start, end, label=match_id) for match_id, start, end in matches]
)
disjoint_span_sets = []
while original_spans:
filtered_spans = set(util.filter_spans(original_spans))
disjoint_span_sets.append(filtered_spans)
original_spans -= filtered_spans
# Retokenize with each set of disjoint spans separately
for span_set in disjoint_span_sets:
with doc.retokenize() as retokenizer:
for span in span_set:
attrs = self.attrs[span.label]
index = self.indices[span.label]
token = span[index]
if span.start <= token.i < span.end:
retokenizer.merge(doc[token.i : token.i + 1], attrs)
else:
raise ValueError(
Errors.E1001.format(
patterns=self.matcher.get(span.label),
span=[t.text for t in span],
index=index,
)
)
)
return doc
def load_from_tag_map(
@@ -93,12 +126,14 @@ class AttributeRuler(Pipe):
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def add(self, patterns: List[List[Dict]], attrs: Dict, index: int = 0) -> None:
def add(
self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0
) -> None:
"""Add Matcher patterns for tokens that should be modified with the
provided attributes. The token at the specified index within the
matched span will be assigned the attributes.
pattern (List[List[Dict]]): A list of Matcher patterns.
patterns (Iterable[MatcherPatternType]): A list of Matcher patterns.
attrs (Dict): The attributes to assign to the target token in the
matched span.
index (int): The index of the token in the matched span to modify. May

View File

@@ -12,6 +12,24 @@ def nlp():
return English()
@pytest.fixture
def pattern_dicts():
return [
{
"patterns": [[{"ORTH": "a"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture
def tag_map():
return {
@@ -25,13 +43,21 @@ def morph_rules():
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
def test_attributeruler_init(nlp):
a = AttributeRuler(nlp.vocab)
def test_attributeruler_init(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"})
a.add([[{"ORTH": "test"}]], {"LEMMA": "cat", "MORPH": "Number=Sing|Case=Nom"})
a.add([[{"ORTH": "test"}]], {"LEMMA": "dog"})
for p in pattern_dicts:
a.add(**p)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing"
def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
@@ -43,7 +69,11 @@ def test_attributeruler_init(nlp):
def test_attributeruler_tag_map(nlp, tag_map):
a = AttributeRuler(nlp.vocab)
a.load_from_tag_map(tag_map)
doc = get_doc(nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
doc = get_doc(
nlp.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = a(doc)
for i in range(len(doc)):
@@ -58,7 +88,11 @@ def test_attributeruler_tag_map(nlp, tag_map):
def test_attributeruler_morph_rules(nlp, morph_rules):
a = AttributeRuler(nlp.vocab)
a.load_from_morph_rules(morph_rules)
doc = get_doc(nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
doc = get_doc(
nlp.vocab,
words=["This", "is", "the", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = a(doc)
for i in range(len(doc)):
@@ -73,8 +107,16 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
def test_attributeruler_indices(nlp):
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
a.add(
[[{"ORTH": "a"}, {"ORTH": "test"}]],
{"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
index=0,
)
a.add(
[[{"ORTH": "This"}, {"ORTH": "is"}]],
{"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
index=1,
)
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
text = "This is a test."
@@ -97,10 +139,19 @@ def test_attributeruler_indices(nlp):
with pytest.raises(ValueError):
doc = nlp(text)
def test_attributeruler_serialize(nlp):
def test_attributeruler_serialize(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
a.add(
[[{"ORTH": "a"}, {"ORTH": "test"}]],
{"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
index=0,
)
a.add(
[[{"ORTH": "This"}, {"ORTH": "is"}]],
{"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
index=1,
)
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
text = "This is a test."