mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 01:32:32 +03:00
Extend AttributeRuler functionality
* Add option to initialize with a dict of AttributeRuler patterns * Instead of silently discarding overlapping matches (the default behavior for the retokenizer if only the attrs differ), split the matches into disjoint sets and retokenize each set separately. This allows, for instance, one pattern to set the POS and another pattern to set the lemma. (If two matches modify the same attribute, it looks like the attrs are applied in the order they were added, but it may not be deterministic?) * Improve types
This commit is contained in:
parent
352b918356
commit
ca33e891e2
|
@ -1,5 +1,5 @@
|
||||||
import srsly
|
import srsly
|
||||||
from typing import List, Dict, Union, Iterable
|
from typing import List, Dict, Union, Iterable, Any
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
|
@ -7,11 +7,14 @@ from ..errors import Errors
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..matcher import Matcher
|
from ..matcher import Matcher
|
||||||
from ..symbols import IDS
|
from ..symbols import IDS
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc, Span
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
MatcherPatternType = List[Dict[Union[int, str], Any]]
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"attribute_ruler",
|
"attribute_ruler",
|
||||||
assigns=[],
|
assigns=[],
|
||||||
|
@ -20,9 +23,9 @@ from .. import util
|
||||||
default_score_weights={},
|
default_score_weights={},
|
||||||
)
|
)
|
||||||
def make_attribute_ruler(
|
def make_attribute_ruler(
|
||||||
nlp: Language, name: str,
|
nlp: Language, name: str, pattern_dicts: Iterable[Dict] = tuple()
|
||||||
):
|
):
|
||||||
return AttributeRuler(nlp.vocab, name)
|
return AttributeRuler(nlp.vocab, name, pattern_dicts=pattern_dicts)
|
||||||
|
|
||||||
|
|
||||||
class AttributeRuler(Pipe):
|
class AttributeRuler(Pipe):
|
||||||
|
@ -32,10 +35,22 @@ class AttributeRuler(Pipe):
|
||||||
DOCS: https://spacy.io/api/attributeruler
|
DOCS: https://spacy.io/api/attributeruler
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab: Vocab, name: str = "attribute_ruler") -> None:
|
def __init__(
|
||||||
"""Initialize the attributeruler.
|
self,
|
||||||
|
vocab: Vocab,
|
||||||
|
name: str = "attribute_ruler",
|
||||||
|
*,
|
||||||
|
pattern_dicts: List[Dict[str, Union[List[MatcherPatternType], Dict, int]]] = {},
|
||||||
|
) -> None:
|
||||||
|
"""Initialize the AttributeRuler.
|
||||||
|
|
||||||
RETURNS (AttributeRuler): The attributeruler component.
|
vocab (Vocab): The vocab.
|
||||||
|
name (str): The pipe name. Defaults to "attribute_ruler".
|
||||||
|
pattern_dicts (List[Dict]): A list of pattern dicts with the keys as
|
||||||
|
the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add
|
||||||
|
as patterns.
|
||||||
|
|
||||||
|
RETURNS (AttributeRuler): The AttributeRuler component.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/attributeruler#init
|
DOCS: https://spacy.io/api/attributeruler#init
|
||||||
"""
|
"""
|
||||||
|
@ -45,6 +60,9 @@ class AttributeRuler(Pipe):
|
||||||
self.attrs = []
|
self.attrs = []
|
||||||
self.indices = []
|
self.indices = []
|
||||||
|
|
||||||
|
for p in pattern_dicts:
|
||||||
|
self.add(**p)
|
||||||
|
|
||||||
def __call__(self, doc: Doc) -> Doc:
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
"""Apply the attributeruler to a Doc and set all attribute exceptions.
|
"""Apply the attributeruler to a Doc and set all attribute exceptions.
|
||||||
|
|
||||||
|
@ -54,18 +72,33 @@ class AttributeRuler(Pipe):
|
||||||
DOCS: https://spacy.io/api/attributeruler#call
|
DOCS: https://spacy.io/api/attributeruler#call
|
||||||
"""
|
"""
|
||||||
matches = self.matcher(doc)
|
matches = self.matcher(doc)
|
||||||
|
|
||||||
|
# Multiple patterns may apply to the same token but the retokenizer can
|
||||||
|
# only handle one merge per token, so split the matches into sets of
|
||||||
|
# disjoint spans.
|
||||||
|
original_spans = set(
|
||||||
|
[Span(doc, start, end, label=match_id) for match_id, start, end in matches]
|
||||||
|
)
|
||||||
|
disjoint_span_sets = []
|
||||||
|
while original_spans:
|
||||||
|
filtered_spans = set(util.filter_spans(original_spans))
|
||||||
|
disjoint_span_sets.append(filtered_spans)
|
||||||
|
original_spans -= filtered_spans
|
||||||
|
|
||||||
|
# Retokenize with each set of disjoint spans separately
|
||||||
|
for span_set in disjoint_span_sets:
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for match_id, start, end in matches:
|
for span in span_set:
|
||||||
attrs = self.attrs[match_id]
|
attrs = self.attrs[span.label]
|
||||||
index = self.indices[match_id]
|
index = self.indices[span.label]
|
||||||
token = doc[start:end][index]
|
token = span[index]
|
||||||
if start <= token.i < end:
|
if span.start <= token.i < span.end:
|
||||||
retokenizer.merge(doc[token.i : token.i + 1], attrs)
|
retokenizer.merge(doc[token.i : token.i + 1], attrs)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
Errors.E1001.format(
|
Errors.E1001.format(
|
||||||
patterns=self.matcher.get(match_id),
|
patterns=self.matcher.get(span.label),
|
||||||
span=[t.text for t in doc[start:end]],
|
span=[t.text for t in span],
|
||||||
index=index,
|
index=index,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -93,12 +126,14 @@ class AttributeRuler(Pipe):
|
||||||
attrs["MORPH"] = self.vocab.strings[morph]
|
attrs["MORPH"] = self.vocab.strings[morph]
|
||||||
self.add([pattern], attrs)
|
self.add([pattern], attrs)
|
||||||
|
|
||||||
def add(self, patterns: List[List[Dict]], attrs: Dict, index: int = 0) -> None:
|
def add(
|
||||||
|
self, patterns: Iterable[MatcherPatternType], attrs: Dict, index: int = 0
|
||||||
|
) -> None:
|
||||||
"""Add Matcher patterns for tokens that should be modified with the
|
"""Add Matcher patterns for tokens that should be modified with the
|
||||||
provided attributes. The token at the specified index within the
|
provided attributes. The token at the specified index within the
|
||||||
matched span will be assigned the attributes.
|
matched span will be assigned the attributes.
|
||||||
|
|
||||||
pattern (List[List[Dict]]): A list of Matcher patterns.
|
pattern (Iterable[List[Dict]]): A list of Matcher patterns.
|
||||||
attrs (Dict): The attributes to assign to the target token in the
|
attrs (Dict): The attributes to assign to the target token in the
|
||||||
matched span.
|
matched span.
|
||||||
index (int): The index of the token in the matched span to modify. May
|
index (int): The index of the token in the matched span to modify. May
|
||||||
|
|
|
@ -12,6 +12,24 @@ def nlp():
|
||||||
return English()
|
return English()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def pattern_dicts():
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"patterns": [[{"ORTH": "a"}]],
|
||||||
|
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||||
|
},
|
||||||
|
# one pattern sets the lemma
|
||||||
|
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
|
||||||
|
# another pattern sets the morphology
|
||||||
|
{
|
||||||
|
"patterns": [[{"ORTH": "test"}]],
|
||||||
|
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
|
||||||
|
"index": 0,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tag_map():
|
def tag_map():
|
||||||
return {
|
return {
|
||||||
|
@ -25,13 +43,21 @@ def morph_rules():
|
||||||
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
|
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
|
||||||
|
|
||||||
|
|
||||||
def test_attributeruler_init(nlp):
|
def test_attributeruler_init(nlp, pattern_dicts):
|
||||||
a = AttributeRuler(nlp.vocab)
|
|
||||||
|
|
||||||
a = nlp.add_pipe("attribute_ruler")
|
a = nlp.add_pipe("attribute_ruler")
|
||||||
a.add([[{"ORTH": "a"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"})
|
for p in pattern_dicts:
|
||||||
a.add([[{"ORTH": "test"}]], {"LEMMA": "cat", "MORPH": "Number=Sing|Case=Nom"})
|
a.add(**p)
|
||||||
a.add([[{"ORTH": "test"}]], {"LEMMA": "dog"})
|
|
||||||
|
doc = nlp("This is a test.")
|
||||||
|
assert doc[2].lemma_ == "the"
|
||||||
|
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||||
|
assert doc[3].lemma_ == "cat"
|
||||||
|
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||||
|
|
||||||
|
|
||||||
|
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||||
|
# initialize with patterns
|
||||||
|
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||||
|
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert doc[2].lemma_ == "the"
|
assert doc[2].lemma_ == "the"
|
||||||
|
@ -43,7 +69,11 @@ def test_attributeruler_init(nlp):
|
||||||
def test_attributeruler_tag_map(nlp, tag_map):
|
def test_attributeruler_tag_map(nlp, tag_map):
|
||||||
a = AttributeRuler(nlp.vocab)
|
a = AttributeRuler(nlp.vocab)
|
||||||
a.load_from_tag_map(tag_map)
|
a.load_from_tag_map(tag_map)
|
||||||
doc = get_doc(nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
|
doc = get_doc(
|
||||||
|
nlp.vocab,
|
||||||
|
words=["This", "is", "a", "test", "."],
|
||||||
|
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||||
|
)
|
||||||
doc = a(doc)
|
doc = a(doc)
|
||||||
|
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
|
@ -58,7 +88,11 @@ def test_attributeruler_tag_map(nlp, tag_map):
|
||||||
def test_attributeruler_morph_rules(nlp, morph_rules):
|
def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
a = AttributeRuler(nlp.vocab)
|
a = AttributeRuler(nlp.vocab)
|
||||||
a.load_from_morph_rules(morph_rules)
|
a.load_from_morph_rules(morph_rules)
|
||||||
doc = get_doc(nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."])
|
doc = get_doc(
|
||||||
|
nlp.vocab,
|
||||||
|
words=["This", "is", "the", "test", "."],
|
||||||
|
tags=["DT", "VBZ", "DT", "NN", "."],
|
||||||
|
)
|
||||||
doc = a(doc)
|
doc = a(doc)
|
||||||
|
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
|
@ -73,8 +107,16 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
|
||||||
|
|
||||||
def test_attributeruler_indices(nlp):
|
def test_attributeruler_indices(nlp):
|
||||||
a = nlp.add_pipe("attribute_ruler")
|
a = nlp.add_pipe("attribute_ruler")
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
|
a.add(
|
||||||
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
|
[[{"ORTH": "a"}, {"ORTH": "test"}]],
|
||||||
|
{"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||||
|
index=0,
|
||||||
|
)
|
||||||
|
a.add(
|
||||||
|
[[{"ORTH": "This"}, {"ORTH": "is"}]],
|
||||||
|
{"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
|
||||||
|
index=1,
|
||||||
|
)
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
|
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
|
||||||
|
|
||||||
text = "This is a test."
|
text = "This is a test."
|
||||||
|
@ -97,10 +139,19 @@ def test_attributeruler_indices(nlp):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
|
||||||
def test_attributeruler_serialize(nlp):
|
|
||||||
|
def test_attributeruler_serialize(nlp, pattern_dicts):
|
||||||
a = nlp.add_pipe("attribute_ruler")
|
a = nlp.add_pipe("attribute_ruler")
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
|
a.add(
|
||||||
a.add([[{"ORTH": "This"}, {"ORTH": "is"}]], {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
|
[[{"ORTH": "a"}, {"ORTH": "test"}]],
|
||||||
|
{"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
|
||||||
|
index=0,
|
||||||
|
)
|
||||||
|
a.add(
|
||||||
|
[[{"ORTH": "This"}, {"ORTH": "is"}]],
|
||||||
|
{"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
|
||||||
|
index=1,
|
||||||
|
)
|
||||||
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
|
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
|
||||||
|
|
||||||
text = "This is a test."
|
text = "This is a test."
|
||||||
|
|
Loading…
Reference in New Issue
Block a user