From 90d88729e0cc022596c1df6339848476f004c3c2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 26 Aug 2020 15:39:30 +0200
Subject: [PATCH] Add AttributeRuler.score (#5963)

* Add AttributeRuler.score

Add scoring for TAG / POS / MORPH / LEMMA if these are present in the
assigned token attributes. Add default score weights (that don't really
make a lot of sense) so that the scores are in the default config in
some form.

* Update docs
---
 spacy/pipeline/attributeruler.py            | 30 ++++++++++++++++++++-
 spacy/pipeline/tagger.pyx                   |  2 +-
 spacy/tests/pipeline/test_attributeruler.py | 18 +++++++++++++
 website/docs/api/attributeruler.md          | 15 +++++++++++
 website/docs/api/tagger.md                  |  2 +-
 5 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index aba76664c..d93afc642 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -4,9 +4,11 @@ from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
+from ..gold import validate_examples
 from ..language import Language
 from ..matcher import Matcher
-from ..symbols import IDS
+from ..scorer import Scorer
+from ..symbols import IDS, TAG, POS, MORPH, LEMMA
 from ..tokens import Doc, Span
 from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
 from ..vocab import Vocab
@@ -192,6 +194,32 @@ class AttributeRuler(Pipe):
             all_patterns.append(p)
         return all_patterns
 
+    def score(self, examples, **kwargs):
+        """Score a batch of examples.
+
+        examples (Iterable[Example]): The examples to score.
+        RETURNS (Dict[str, Any]): The scores, produced by
+            Scorer.score_token_attr for the attributes "tag", "pos", "morph"
+            and "lemma" if present in any of the target token attributes.
+
+        DOCS: https://spacy.io/api/attributeruler#score
+        """
+        validate_examples(examples, "AttributeRuler.score")
+        results = {}
+        attrs = set()
+        for token_attrs in self.attrs:
+            attrs.update(token_attrs)
+        for attr in attrs:
+            if attr == TAG:
+                results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+            elif attr == POS:
+                results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+            elif attr == MORPH:
+                results.update(Scorer.score_token_attr(examples, "morph", **kwargs))
+            elif attr == LEMMA:
+                results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+        return results
+
     def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
         """Serialize the AttributeRuler to a bytestring.
 
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 2a4274597..2277aaf75 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -307,7 +307,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Example]): The examples to score.
         RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "tag", "pos" and "lemma".
+            Scorer.score_token_attr for the attribute "tag".
 
         DOCS: https://spacy.io/api/tagger#score
         """
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index bcde7bf63..d9a492580 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -1,5 +1,6 @@
 import pytest
 import numpy
+from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.pipeline import AttributeRuler
 from spacy import util, registry
@@ -94,6 +95,23 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc[3].morph_ == "Case=Nom|Number=Sing"
 
 
+def test_attributeruler_score(nlp, pattern_dicts):
+    # initialize with patterns
+    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    doc = nlp("This is a test.")
+    assert doc[2].lemma_ == "the"
+    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert doc[3].lemma_ == "cat"
+    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+
+    dev_examples = [Example.from_dict(nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]})]
+    scores = nlp.evaluate(dev_examples)
+    # "cat" is the only correct lemma
+    assert scores["lemma_acc"] == pytest.approx(0.2)
+    # the empty morphs are correct
+    assert scores["morph_acc"] == pytest.approx(0.6)
+
+
 def test_attributeruler_tag_map(nlp, tag_map):
     a = AttributeRuler(nlp.vocab)
     a.load_from_tag_map(tag_map)
diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index 98f267e87..413aab2d8 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -138,6 +138,21 @@ Get all patterns that have been added to the attribute ruler in the
 | ----------- | -------------------------------------------------------------------------------------------- |
 | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
 
+## AttributeRuler.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = attribute_ruler.score(examples)
+> ```
+
+| Name        | Description |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~ |
+| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
+
 ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
 
 Load attribute ruler patterns from a tag map.
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index b255b2261..af0e3af3c 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -252,7 +252,7 @@ Score a batch of examples.
 | Name        | Description |
 | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`  | The examples to score. ~~Iterable[Example]~~ |
-| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ |
+| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ |
 
 ## Tagger.create_optimizer {#create_optimizer tag="method"}
 
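For context, here is a minimal sketch of how the new scoring surfaces through `nlp.evaluate`, along the lines of `test_attributeruler_score` above. The `pattern_dicts` values below are illustrative stand-ins for the test fixture, which is defined outside this diff, and `spacy.gold.Example` plus the `pattern_dicts` config option reflect the development API used in this patch.

```python
from spacy.lang.en import English
from spacy.gold import Example  # spacy.training.Example in later v3 releases

# Illustrative patterns (hypothetical, not the repo's test fixture):
# rewrite the lemma and morph of "a" and "test" via Matcher token patterns.
pattern_dicts = [
    {
        "patterns": [[{"ORTH": "a"}]],
        "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
    },
    {
        "patterns": [[{"ORTH": "test"}]],
        "attrs": {"LEMMA": "cat", "MORPH": "Case=Nom|Number=Sing"},
    },
]

nlp = English()
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})

# The reference annotations only supply lemmas; the gold morphs stay empty.
dev_examples = [
    Example.from_dict(
        nlp.make_doc("This is a test."),
        {"lemmas": ["this", "is", "a", "cat", "."]},
    )
]

# nlp.evaluate runs the pipeline on the predicted side and picks up each
# component's score method, so the ruler's tag/pos/morph/lemma accuracies
# appear in the returned dict.
scores = nlp.evaluate(dev_examples)
print(scores["lemma_acc"])  # 1/5: only "test" -> "cat" matches the gold lemmas
print(scores["morph_acc"])  # 3/5: tokens left without morphs agree with the empty gold morphs
```

In released spaCy v3 the patterns are supplied with `attribute_ruler.add_patterns(...)` rather than a `pattern_dicts` config entry, so the setup above would need a small adjustment there.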