Refactor scoring methods to use registered functions (#8766)

* Add scorer option to components

Add an optional `scorer` parameter to all pipeline components. If a
scoring function is provided, it overrides the default scoring method
for that component.
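
For example, a user-registered scoring function can be plugged in when
adding a component (a minimal sketch based on the test added in this
commit; it assumes an `nlp` pipeline and a function registered under the
illustrative name `weird_scorer.v1`):

    ruler = nlp.add_pipe(
        "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
    )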

* Add registered scorers for all components

* Add `scorers` registry
* Move all scoring methods out of the components into standalone
  functions and register them (see the registration sketch below)
* Use the registered scoring methods as defaults in configs and inits
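
As a sketch, registration for the built-in scorers follows one pattern:
a registered factory that simply returns the scoring function (taken
from the tagger changes in this diff):

    from spacy.scorer import Scorer
    from spacy.util import registry

    def tagger_score(examples, **kwargs):
        return Scorer.score_token_attr(examples, "tag", **kwargs)

    @registry.scorers("spacy.tagger_scorer.v1")
    def make_tagger_scorer():
        return tagger_score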

Additional:

* The scoring methods no longer have access to the full component, so
  `Pipe.score` passes settings from `cfg` (such as `labels`, `threshold`,
  and `positive_label`) to the scorer as default options (see the
  evaluation sketch after this list)
* The `attribute_ruler` scoring method no longer has access to the
  patterns, so it now always calls all of the scoring methods ("tag",
  "pos", "morph" and "lemma")
* Bug fix: the `spancat` scoring method now sets `allow_overlap` to
  `True` by default so that overlapping spans are scored correctly
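
A sketch of how these settings flow through at evaluation time (based on
the updated attribute ruler test in this diff, where `dev_examples`
stands in for a list of `Example` objects): defaults come from the
component's `cfg`, and anything passed via `scorer_cfg` overrides them
when `Pipe.score` calls the registered scorer:

    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
    assert scores["weird_score"] == 0.12345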

* Update Russian lemmatizer to use direct score method

* Check type of cfg in Pipe.score

* Fix check

* Update spacy/pipeline/sentencizer.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove validate_examples from scoring functions

* Use Pipe.labels instead of Pipe.cfg["labels"]

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Adriane Boyd 2021-08-10 15:13:39 +02:00 committed by GitHub
parent ee011ca963
commit f99d6d5e39
36 changed files with 638 additions and 363 deletions

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
@Bengali.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Bengali"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -28,13 +28,25 @@ class Catalan(Language):
@Catalan.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return CatalanLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Catalan"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
@Greek.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return GreekLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Greek"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
@English.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return EnglishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["English"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
@Spanish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return SpanishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Spanish"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
@Persian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Persian"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -31,13 +31,25 @@ class French(Language):
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return FrenchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["French"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -23,13 +23,25 @@ class Italian(Language):
@Italian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return ItalianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Italian"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .lemmatizer import MacedonianLemmatizer
from .stop_words import STOP_WORDS
@@ -38,13 +38,25 @@ class Macedonian(Language):
@Macedonian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return MacedonianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Macedonian"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -26,13 +26,25 @@ class Norwegian(Language):
@Norwegian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Norwegian"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -30,13 +30,25 @@ class Dutch(Language):
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return DutchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Dutch"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -33,13 +33,25 @@ class Polish(Language):
@Polish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return PolishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Polish"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -22,7 +22,12 @@ class Russian(Language):
@Russian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
@@ -31,8 +36,11 @@ def make_lemmatizer(
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return RussianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Russian"]

View File

@@ -1,8 +1,9 @@
from typing import Optional, List, Dict, Tuple
from typing import Optional, List, Dict, Tuple, Callable
from thinc.api import Model
from ...pipeline import Lemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
@@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
@@ -29,13 +29,25 @@ class Swedish(Language):
@Swedish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Swedish"]

View File

@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
@@ -23,13 +23,25 @@ class Ukrainian(Language):
@Ukrainian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return UkrainianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Ukrainian"]

View File

@@ -1,8 +1,9 @@
from typing import Optional
from typing import Optional, Callable
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
@@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
*,
mode: str = "pymorphy2",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
try:
@@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

View File

@@ -5,15 +5,15 @@ from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..training import validate_examples, Example
from ..training import Example
from ..language import Language
from ..matcher import Matcher
from ..scorer import Scorer
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
from ..symbols import IDS
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
from ..util import SimpleFrozenList
from ..util import SimpleFrozenList, registry
from .. import util
@@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory("attribute_ruler", default_config={"validate": False})
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
return AttributeRuler(nlp.vocab, name, validate=validate)
@Language.factory(
"attribute_ruler",
default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(
examples: Iterable[Example], **kwargs
) -> Dict[str, Any]:
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer():
return attribute_ruler_score
class AttributeRuler(Pipe):
@@ -36,7 +70,12 @@ class AttributeRuler(Pipe):
"""
def __init__(
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
self,
vocab: Vocab,
name: str = "attribute_ruler",
*,
validate: bool = False,
scorer: Optional[Callable] = attribute_ruler_score,
) -> None:
"""Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns
@@ -57,6 +96,7 @@ class AttributeRuler(Pipe):
self.attrs = []
self._attrs_unnormed = [] # store for reference
self.indices = []
self.scorer = scorer
def clear(self) -> None:
"""Reset all patterns."""
@@ -228,45 +268,6 @@ class AttributeRuler(Pipe):
all_patterns.append(p)
return all_patterns
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
and "lemma" for the target token attributes.
DOCS: https://spacy.io/api/tagger#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "AttributeRuler.score")
results = {}
attrs = set()
for token_attrs in self.attrs:
attrs.update(token_attrs)
for attr in attrs:
if attr == TAG:
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
elif attr == POS:
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
elif attr == MORPH:
results.update(
Scorer.score_token_attr(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
elif attr == LEMMA:
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring.

View File

@@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
from typing import Optional, Iterable
from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@@ -12,7 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer
from ..training import validate_examples
from ..util import registry
default_model_config = """
@@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@@ -63,7 +64,8 @@ def make_parser(
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int
min_action_freq: int,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
@@ -115,7 +117,8 @@ def make_parser(
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=scorer,
)
@Language.factory(
@@ -130,6 +133,7 @@ def make_parser(
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
@@ -151,6 +155,7 @@ def make_beam_parser(
beam_width: int,
beam_density: float,
beam_update_prob: float,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
@@ -207,10 +212,41 @@ def make_beam_parser(
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=scorer,
)
def parser_score(examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
@@ -233,6 +269,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
scorer=parser_score,
):
"""Create a DependencyParser.
"""
@@ -249,6 +286,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
@property
@@ -281,31 +319,6 @@ cdef class DependencyParser(Parser):
labels.add(label)
return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
def scored_parses(self, beams):
"""Return two dictionaries with scores for each beam/doc that was processed:
one containing (i, head) keys, and another containing (i, label) keys.

View File

@@ -16,7 +16,7 @@ from ..language import Language
from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from ..util import SimpleFrozenList, registry
from .. import util
from ..scorer import Scorer
@@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
},
default_score_weights={
"nel_micro_f": 1.0,
@@ -68,6 +69,7 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
scorer: Optional[Callable],
):
"""Construct an EntityLinker component.
@@ -92,9 +94,19 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
scorer=scorer,
)
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
return entity_linker_score
class EntityLinker(TrainablePipe):
"""Pipeline component for named entity linking.
@@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
@@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe):
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
@@ -389,17 +403,6 @@
for token in ent:
token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS TODO: https://spacy.io/api/entity_linker#score
"""
validate_examples(examples, "EntityLinker.score")
return Scorer.score_links(examples, negative_labels=[self.NIL])
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.

View File

@@ -8,11 +8,10 @@ from .pipe import Pipe
from ..training import Example
from ..language import Language
from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf
from ..training import validate_examples
DEFAULT_ENT_ID_SEP = "||"
@@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
},
default_score_weights={
"ents_f": 1.0,
@@ -42,6 +42,7 @@ def make_entity_ruler(
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
scorer: Optional[Callable],
):
return EntityRuler(
nlp,
@@ -50,9 +51,19 @@ def make_entity_ruler(
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
scorer=scorer,
)
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
@@ -74,6 +85,7 @@ class EntityRuler(Pipe):
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None,
scorer: Optional[Callable] = entity_ruler_score,
) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -112,6 +124,7 @@ class EntityRuler(Pipe):
self._ent_ids = defaultdict(dict)
if patterns is not None:
self.add_patterns(patterns)
self.scorer = scorer
def __len__(self) -> int:
"""The number of all patterns added to the entity ruler."""
@@ -358,10 +371,6 @@ class EntityRuler(Pipe):
label = f"{label}{self.ent_id_sep}{ent_id}"
return label
def score(self, examples, **kwargs):
validate_examples(examples, "EntityRuler.score")
return get_ner_prf(examples)
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":

View File

@@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from ..training import validate_examples
from ..util import logger, SimpleFrozenList
from ..util import logger, SimpleFrozenList, registry
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "lookup", "overwrite": False},
default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_token_attr(examples, "lemma", **kwargs)
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer():
return lemmatizer_score
class Lemmatizer(Pipe):
@@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
*,
mode: str = "lookup",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
"""Initialize a Lemmatizer.
@@ -89,6 +110,7 @@ class Lemmatizer(Pipe):
raise ValueError(Errors.E1003.format(mode=mode))
self.lemmatize = getattr(self, mode_attr)
self.cache = {}
self.scorer = scorer
@property
def mode(self):
@@ -247,17 +269,6 @@
"""
return False
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
):

View File

@@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict
from typing import Optional, Union, Dict, Callable
import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice
@@ -17,6 +17,7 @@ from .tagger import Tagger
from .. import util
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
default_model_config = """
@@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL},
default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
scorer: Optional[Callable],
):
return Morphologizer(nlp.vocab, model, name)
return Morphologizer(nlp.vocab, model, name, scorer=scorer)
def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
return morphologizer_score
class Morphologizer(Tagger):
@@ -67,6 +86,8 @@ class Morphologizer(Tagger):
vocab: Vocab,
model: Model,
name: str = "morphologizer",
*,
scorer: Optional[Callable] = morphologizer_score,
):
"""Initialize a morphologizer.
@@ -87,6 +108,7 @@ class Morphologizer(Tagger):
# 2) labels_pos stores a mapping from morph+POS->POS
cfg = {"labels_morph": {}, "labels_pos": {}}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property
def labels(self):
@@ -246,24 +268,3 @@
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results

View File

@@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
from typing import Optional, Iterable
from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
@@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
default_model_config = """
@@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@@ -52,7 +53,8 @@ def make_ner(
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
@@ -92,6 +94,7 @@ def make_ner(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
scorer=scorer,
)
@Language.factory(
@@ -104,7 +107,8 @@ def make_ner(
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
"incorrect_spans_key": None
"incorrect_spans_key": None,
"scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
@@ -117,7 +121,8 @@ def make_beam_ner(
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str]=None
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
@@ -164,10 +169,20 @@ def make_beam_ner(
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs)
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.
@@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
scorer=ner_score,
):
"""Create an EntityRecognizer.
"""
@@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def add_multitask_objective(self, mt_component):
@@ -227,17 +244,6 @@
if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return get_ner_prf(examples)
def scored_ents(self, beams):
"""Return a dictionary of (start, end, label) tuples with corresponding scores
for each beam/doc that was processed.

View File

@@ -81,6 +81,17 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#score
"""
if hasattr(self, "scorer") and self.scorer is not None:
scorer_kwargs = {}
# use default settings from cfg (e.g., threshold)
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
scorer_kwargs.update(self.cfg)
# override self.cfg["labels"] with self.labels
if hasattr(self, "labels"):
scorer_kwargs["labels"] = self.labels
# override with kwargs settings
scorer_kwargs.update(kwargs)
return self.scorer(examples, **scorer_kwargs)
return {}
@property

View File

@@ -1,26 +1,29 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, List
from typing import Optional, List, Callable
import srsly
from ..tokens.doc cimport Doc
from .pipe import Pipe
from .senter import senter_score
from ..language import Language
from ..scorer import Scorer
from ..training import validate_examples
from .. import util
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None},
default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
punct_chars: Optional[List[str]]
punct_chars: Optional[List[str]],
scorer: Optional[Callable],
):
return Sentencizer(name, punct_chars=punct_chars)
return Sentencizer(name, punct_chars=punct_chars, scorer=scorer)
class Sentencizer(Pipe):
@@ -41,7 +44,13 @@ class Sentencizer(Pipe):
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'｡', '。']
def __init__(self, name="sentencizer", *, punct_chars=None):
def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
scorer=senter_score,
):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
@@ -55,6 +64,7 @@ class Sentencizer(Pipe):
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
self.scorer = scorer
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -122,22 +132,6 @@
else:
doc.c[j].sent_start = -1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring.

View File

@@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Optional, Callable
import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@@ -11,6 +12,7 @@ from ..language import Language
from ..errors import Errors
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
@@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL},
default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model):
return SentenceRecognizer(nlp.vocab, model, name)
def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger):
@@ -46,7 +62,7 @@
DOCS: https://spacy.io/api/sentencerecognizer
"""
def __init__(self, vocab, model, name="senter"):
def __init__(self, vocab, model, name="senter", *, scorer=senter_score):
"""Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary.
@@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger):
self.name = name
self._rehearsal_model = None
self.cfg = {}
self.scorer = scorer
@property
def labels(self):
@@ -153,18 +170,3 @@
def add_label(self, label, values=None):
raise NotImplementedError
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results

View File

@@ -98,6 +98,7 @@ def build_ngram_range_suggester(
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
@@ -107,8 +108,9 @@ def make_spancat(
suggester: Callable[[List[Doc]], Ragged],
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
@@ -138,9 +140,28 @@
threshold=threshold,
max_positive=max_positive,
name=name,
scorer=scorer,
)
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score
class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text.
@@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans",
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None:
"""Initialize the span categorizer.
@@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester
self.model = model
self.name = name
self.scorer = scorer
@property
def key(self) -> str:
@@ -373,28 +396,6 @@
else:
self.model.initialize()
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("labels", self.labels)
kwargs.setdefault("multi_label", True)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
def _validate_categories(self, examples):
# TODO
pass

View File

@@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@@ -18,6 +19,7 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
@@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name)
return Tagger(nlp.vocab, model, name, scorer=scorer)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe):
@@ -60,7 +71,7 @@
DOCS: https://spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger"):
def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
@@ -76,6 +87,7 @@ class Tagger(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": []}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property
def labels(self):
@@ -289,15 +301,3 @@
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".
DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)

View File

@@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -96,7 +105,21 @@
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe):
@@ -106,7 +129,13 @@
"""
def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None:
"""Initialize a text categorizer for single-label classification.
@@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self.scorer = scorer
@property
def labels(self) -> Tuple[str]:
@@ -354,26 +384,6 @@
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)
def _validate_categories(self, examples: List[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:

View File

@@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d
from ..language import Language
from ..training import Example, validate_examples, validate_get_examples
from ..training import Example, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
from .textcat import TextCategorizer
@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -97,7 +106,23 @@
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer):
@@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel",
*,
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
self.scorer = scorer
def initialize(
self,
@@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)
def _validate_categories(self, examples: List[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""

View File

@@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
cdef public object scorer

View File

@@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=None,
):
"""Create a Parser.
@@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask)
self._rehearsal_model = None
self.scorer = scorer
def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""

View File

@@ -537,7 +537,7 @@
@staticmethod
def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str]
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,
@@ -711,7 +711,7 @@
}
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:

View File

@@ -32,24 +32,6 @@ def pattern_dicts():
]
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture
def tag_map():
return {
@@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}
@@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set
assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp):

View File

@@ -95,6 +95,7 @@ class registry(thinc.registry):
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the