diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 23c3ff485..945560aac 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES @@ -23,13 +23,25 @@ class Bengali(Language): @Bengali.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 81f39b13c..15d395c12 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -28,13 +28,25 @@ class Catalan(Language): @Catalan.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return CatalanLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Catalan"] diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index be59a3500..e843114fc 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -28,13 +28,25 @@ class Greek(Language): @Greek.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return GreekLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index eea522908..a84b50476 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,4 +1,4 @@ -from typing import 
Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -26,13 +26,25 @@ class English(Language): @English.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return EnglishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["English"] diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 4b329b6f7..2f246a678 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -26,13 +26,25 @@ class Spanish(Language): @Spanish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return SpanishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Spanish"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 77a0a28b9..0c3100f2b 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -26,13 +26,25 @@ class Persian(Language): @Persian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Persian"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index d69a5a718..254e1651b 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -31,13 +31,25 @@ class French(Language): @French.factory( "lemmatizer", assigns=["token.lemma"], 
- default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return FrenchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["French"] diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 672a8698e..fc74789a3 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -23,13 +23,25 @@ class Italian(Language): @Italian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return ItalianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Italian"] diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index 2f6097f16..a8464f3b7 100644 --- a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .lemmatizer import MacedonianLemmatizer from .stop_words import STOP_WORDS @@ -38,13 +38,25 @@ class Macedonian(Language): @Macedonian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return MacedonianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Macedonian"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 0bfde7d28..d08f8f768 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -26,13 +26,25 @@ class Norwegian(Language): @Norwegian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + 
"mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Norwegian"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 5e95b4a8b..0a6480a1d 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -30,13 +30,25 @@ class Dutch(Language): @Dutch.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return DutchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 585e08c60..1d71244a2 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -33,13 +33,25 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return PolishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Polish"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 4287cc288..0f645ddb1 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -22,7 +22,12 @@ class Russian(Language): @Russian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( @@ -31,8 +36,11 @@ def make_lemmatizer( name: str, mode: str, overwrite: bool, + scorer: Optional[Callable], ): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode, 
overwrite=overwrite) + return RussianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 399cd174c..185e09718 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict, Tuple, Callable from thinc.api import Model from ...pipeline import Lemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...symbols import POS from ...tokens import Token from ...vocab import Vocab @@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer() - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 1b1b69fac..aa8d3f110 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -29,13 +29,25 @@ class Swedish(Language): @Swedish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Swedish"] diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 677281ec6..2eef110b2 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -23,13 +23,25 @@ class Ukrainian(Language): @Ukrainian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return UkrainianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 1fb030e06..fd566a3a8 100644 --- 
a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from ..ru.lemmatizer import RussianLemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...vocab import Vocab @@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer(lang="uk") - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index f95a5a48c..733a65199 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -5,15 +5,15 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..training import validate_examples, Example +from ..training import Example from ..language import Language from ..matcher import Matcher from ..scorer import Scorer -from ..symbols import IDS, TAG, POS, MORPH, LEMMA +from ..symbols import IDS from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..vocab import Vocab -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. import util @@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory("attribute_ruler", default_config={"validate": False}) -def make_attribute_ruler(nlp: Language, name: str, validate: bool): - return AttributeRuler(nlp.vocab, name, validate=validate) +@Language.factory( + "attribute_ruler", + default_config={ + "validate": False, + "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, + }, +) +def make_attribute_ruler( + nlp: Language, name: str, validate: bool, scorer: Optional[Callable] +): + return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) + + +def attribute_ruler_score( + examples: Iterable[Example], **kwargs +) -> Dict[str, Any]: + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update( + Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs) + ) + results.update( + Scorer.score_token_attr_per_feat( + examples, "morph", getter=morph_key_getter, **kwargs + ) + ) + results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return results + + +@registry.scorers("spacy.attribute_ruler_scorer.v1") +def make_attribute_ruler_scorer(): + return attribute_ruler_score class AttributeRuler(Pipe): @@ -36,7 +70,12 @@ class AttributeRuler(Pipe): """ def __init__( - self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False + self, + vocab: Vocab, + name: str = "attribute_ruler", + *, + validate: bool = False, + scorer: Optional[Callable] = attribute_ruler_score, ) -> None: """Create the AttributeRuler. 
After creation, you can add patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns @@ -57,6 +96,7 @@ class AttributeRuler(Pipe): self.attrs = [] self._attrs_unnormed = [] # store for reference self.indices = [] + self.scorer = scorer def clear(self) -> None: """Reset all patterns.""" @@ -228,45 +268,6 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag", "pos", "morph" - and "lemma" for the target token attributes. - - DOCS: https://spacy.io/api/tagger#score - """ - - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "AttributeRuler.score") - results = {} - attrs = set() - for token_attrs in self.attrs: - attrs.update(token_attrs) - for attr in attrs: - if attr == TAG: - results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - elif attr == POS: - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - elif attr == MORPH: - results.update( - Scorer.score_token_attr( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - results.update( - Scorer.score_token_attr_per_feat( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - elif attr == LEMMA: - results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return results - def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the AttributeRuler to a bytestring. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index be23ab0dd..59364326b 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -12,7 +12,7 @@ from ..language import Language from ._parser_internals import nonproj from ._parser_internals.nonproj import DELIMITER from ..scorer import Scorer -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -63,7 +64,8 @@ def make_parser( moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, - min_action_freq: int + min_action_freq: int, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can @@ -115,7 +117,8 @@ def make_parser( beam_update_prob=0.0, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. 
- incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) @Language.factory( @@ -130,6 +133,7 @@ def make_parser( "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -151,6 +155,7 @@ def make_beam_parser( beam_width: int, beam_density: float, beam_update_prob: float, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component that uses beam-search. The dependency parser jointly learns sentence segmentation and labelled @@ -207,10 +212,41 @@ def make_beam_parser( min_action_freq=min_action_freq, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) +def parser_score(examples, **kwargs): + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans + and Scorer.score_deps. + + DOCS: https://spacy.io/api/dependencyparser#score + """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + + def dep_getter(token, attr): + dep = getattr(token, attr) + dep = token.vocab.strings.as_string(dep).lower() + return dep + results = {} + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + kwargs.setdefault("getter", dep_getter) + kwargs.setdefault("ignore_labels", ("p", "punct")) + results.update(Scorer.score_deps(examples, "dep", **kwargs)) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.parser_scorer.v1") +def make_parser_scorer(): + return parser_score + + cdef class DependencyParser(Parser): """Pipeline component for dependency parsing. @@ -233,6 +269,7 @@ cdef class DependencyParser(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=parser_score, ): """Create a DependencyParser. """ @@ -249,6 +286,7 @@ cdef class DependencyParser(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) @property @@ -281,31 +319,6 @@ cdef class DependencyParser(Parser): labels.add(label) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans - and Scorer.score_deps. - - DOCS: https://spacy.io/api/dependencyparser#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "DependencyParser.score") - def dep_getter(token, attr): - dep = getattr(token, attr) - dep = token.vocab.strings.as_string(dep).lower() - return dep - results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) - kwargs.setdefault("getter", dep_getter) - kwargs.setdefault("ignore_labels", ("p", "punct")) - results.update(Scorer.score_deps(examples, "dep", **kwargs)) - del results["sents_per_type"] - return results - def scored_parses(self, beams): """Return two dictionaries with scores for each beam/doc that was processed: one containing (i, head) keys, and another containing (i, label) keys. 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 7b52025bc..919d1fe6d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -16,7 +16,7 @@ from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors, Warnings -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, }, default_score_weights={ "nel_micro_f": 1.0, @@ -68,6 +69,7 @@ def make_entity_linker( incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + scorer: Optional[Callable], ): """Construct an EntityLinker component. @@ -92,9 +94,19 @@ def make_entity_linker( incl_context=incl_context, entity_vector_length=entity_vector_length, get_candidates=get_candidates, + scorer=scorer, ) +def entity_linker_score(examples, **kwargs): + return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs) + + +@registry.scorers("spacy.entity_linker_scorer.v1") +def make_entity_linker_scorer(): + return entity_linker_score + + class EntityLinker(TrainablePipe): """Pipeline component for named entity linking. @@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe): incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + scorer: Optional[Callable] = entity_linker_score, ) -> None: """Initialize an entity linker. @@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe): # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) + self.scorer = scorer def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -389,17 +403,6 @@ class EntityLinker(TrainablePipe): for token in ent: token.ent_kb_id_ = kb_id - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. - - DOCS TODO: https://spacy.io/api/entity_linker#score - """ - validate_examples(examples, "EntityLinker.score") - return Scorer.score_links(examples, negative_labels=[self.NIL]) - def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. 
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 1dea8fba0..d2a0c5045 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -8,11 +8,10 @@ from .pipe import Pipe from ..training import Example from ..language import Language from ..errors import Errors, Warnings -from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList +from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import get_ner_prf -from ..training import validate_examples DEFAULT_ENT_ID_SEP = "||" @@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "validate": False, "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, }, default_score_weights={ "ents_f": 1.0, @@ -42,6 +42,7 @@ def make_entity_ruler( validate: bool, overwrite_ents: bool, ent_id_sep: str, + scorer: Optional[Callable], ): return EntityRuler( nlp, @@ -50,9 +51,19 @@ def make_entity_ruler( validate=validate, overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, + scorer=scorer, ) +def entity_ruler_score(examples, **kwargs): + return get_ner_prf(examples) + + +@registry.scorers("spacy.entity_ruler_scorer.v1") +def make_entity_ruler_scorer(): + return entity_ruler_score + + class EntityRuler(Pipe): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical @@ -74,6 +85,7 @@ class EntityRuler(Pipe): overwrite_ents: bool = False, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, + scorer: Optional[Callable] = entity_ruler_score, ) -> None: """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -112,6 +124,7 @@ class EntityRuler(Pipe): self._ent_ids = defaultdict(dict) if patterns is not None: self.add_patterns(patterns) + self.scorer = scorer def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" @@ -358,10 +371,6 @@ class EntityRuler(Pipe): label = f"{label}{self.ent_id_sep}{ent_id}" return label - def score(self, examples, **kwargs): - validate_examples(examples, "EntityRuler.score") - return get_ner_prf(examples) - def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 2f436c57a..1bf513661 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab -from ..training import validate_examples -from ..util import logger, SimpleFrozenList +from ..util import logger, SimpleFrozenList, registry from .. 
import util @Language.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) + + +def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_token_attr(examples, "lemma", **kwargs) + + +@registry.scorers("spacy.lemmatizer_scorer.v1") +def make_lemmatizer_scorer(): + return lemmatizer_score class Lemmatizer(Pipe): @@ -60,6 +80,7 @@ class Lemmatizer(Pipe): *, mode: str = "lookup", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: """Initialize a Lemmatizer. @@ -89,6 +110,7 @@ class Lemmatizer(Pipe): raise ValueError(Errors.E1003.format(mode=mode)) self.lemmatize = getattr(self, mode_attr) self.cache = {} + self.scorer = scorer @property def mode(self): @@ -247,17 +269,6 @@ class Lemmatizer(Pipe): """ return False - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. - - DOCS: https://spacy.io/api/lemmatizer#score - """ - validate_examples(examples, "Lemmatizer.score") - return Scorer.score_token_attr(examples, "lemma", **kwargs) - def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 3ba05e616..c5293e860 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Union, Dict +from typing import Optional, Union, Dict, Callable import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -17,6 +17,7 @@ from .tagger import Tagger from .. 
import util from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry default_model_config = """ @@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL}, + default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, model: Model, name: str, + scorer: Optional[Callable], ): - return Morphologizer(nlp.vocab, model, name) + return Morphologizer(nlp.vocab, model, name, scorer=scorer) + + +def morphologizer_score(examples, **kwargs): + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, + "morph", getter=morph_key_getter, **kwargs)) + return results + + +@registry.scorers("spacy.morphologizer_scorer.v1") +def make_morphologizer_scorer(): + return morphologizer_score class Morphologizer(Tagger): @@ -67,6 +86,8 @@ class Morphologizer(Tagger): vocab: Vocab, model: Model, name: str = "morphologizer", + *, + scorer: Optional[Callable] = morphologizer_score, ): """Initialize a morphologizer. @@ -87,6 +108,7 @@ class Morphologizer(Tagger): # 2) labels_pos stores a mapping from morph+POS->POS cfg = {"labels_morph": {}, "labels_pos": {}} self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -246,24 +268,3 @@ class Morphologizer(Tagger): if self.model.ops.xp.isnan(loss): raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "pos" and "morph" and - Scorer.score_token_attr_per_feat for the attribute "morph". 
- - DOCS: https://spacy.io/api/morphologizer#score - """ - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "Morphologizer.score") - results = {} - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) - results.update(Scorer.score_token_attr_per_feat(examples, - "morph", getter=morph_key_getter, **kwargs)) - return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index f4ae4b787..857e3c088 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import get_ner_prf, PRFScore -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, @@ -52,7 +53,8 @@ def make_ner( model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -92,6 +94,7 @@ def make_ner( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + scorer=scorer, ) @Language.factory( @@ -104,7 +107,8 @@ def make_ner( "beam_density": 0.01, "beam_update_prob": 0.5, "beam_width": 32, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": None, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) @@ -117,7 +121,8 @@ def make_beam_ner( beam_width: int, beam_density: float, beam_update_prob: float, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -164,10 +169,20 @@ def make_beam_ner( beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key + incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) +def ner_score(examples, **kwargs): + return get_ner_prf(examples, **kwargs) + + +@registry.scorers("spacy.ner_scorer.v1") +def make_ner_scorer(): + return ner_score + + cdef class EntityRecognizer(Parser): """Pipeline component for named entity recognition. @@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=ner_score, ): """Create an EntityRecognizer. 
""" @@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) def add_multitask_objective(self, mt_component): @@ -227,17 +244,6 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U")) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The NER precision, recall and f-scores. - - DOCS: https://spacy.io/api/entityrecognizer#score - """ - validate_examples(examples, "EntityRecognizer.score") - return get_ner_prf(examples) - def scored_ents(self, beams): """Return a dictionary of (start, end, label) tuples with corresponding scores for each beam/doc that was processed. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 0d298ce4f..14f9f08f8 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -81,6 +81,17 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe#score """ + if hasattr(self, "scorer") and self.scorer is not None: + scorer_kwargs = {} + # use default settings from cfg (e.g., threshold) + if hasattr(self, "cfg") and isinstance(self.cfg, dict): + scorer_kwargs.update(self.cfg) + # override self.cfg["labels"] with self.labels + if hasattr(self, "labels"): + scorer_kwargs["labels"] = self.labels + # override with kwargs settings + scorer_kwargs.update(kwargs) + return self.scorer(examples, **scorer_kwargs) return {} @property diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 60102efcb..c535c7e48 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -1,26 +1,29 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, List +from typing import Optional, List, Callable import srsly from ..tokens.doc cimport Doc + from .pipe import Pipe +from .senter import senter_score from ..language import Language from ..scorer import Scorer -from ..training import validate_examples from .. import util + @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None}, + default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( nlp: Language, name: str, - punct_chars: Optional[List[str]] + punct_chars: Optional[List[str]], + scorer: Optional[Callable], ): - return Sentencizer(name, punct_chars=punct_chars) + return Sentencizer(name, punct_chars=punct_chars, scorer=scorer) class Sentencizer(Pipe): @@ -41,7 +44,13 @@ class Sentencizer(Pipe): '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] - def __init__(self, name="sentencizer", *, punct_chars=None): + def __init__( + self, + name="sentencizer", + *, + punct_chars=None, + scorer=senter_score, + ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be @@ -55,6 +64,7 @@ class Sentencizer(Pipe): self.punct_chars = set(punct_chars) else: self.punct_chars = set(self.default_punct_chars) + self.scorer = scorer def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -122,22 +132,6 @@ class Sentencizer(Pipe): else: doc.c[j].sent_start = -1 - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. 
- RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. - - DOCS: https://spacy.io/api/sentencizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results - def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index f9472abf5..3eeb9b5da 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,5 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from itertools import islice +from typing import Optional, Callable import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -11,6 +12,7 @@ from ..language import Language from ..errors import Errors from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util @@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL}, + default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model): - return SentenceRecognizer(nlp.vocab, model, name) +def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): + return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer) + + +def senter_score(examples, **kwargs): + def has_sents(doc): + return doc.has_annotation("SENT_START") + + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.senter_scorer.v1") +def make_senter_scorer(): + return senter_score class SentenceRecognizer(Tagger): @@ -46,7 +62,7 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model, name="senter"): + def __init__(self, vocab, model, name="senter", *, scorer=senter_score): """Initialize a sentence recognizer. vocab (Vocab): The shared vocabulary. @@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger): self.name = name self._rehearsal_model = None self.cfg = {} + self.scorer = scorer @property def labels(self): @@ -153,18 +170,3 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. 
- DOCS: https://spacy.io/api/sentencerecognizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 8d1be06c3..a143ac4eb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -98,6 +98,7 @@ def build_ngram_range_suggester( "max_positive": None, "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -107,8 +108,9 @@ def make_spancat( suggester: Callable[[List[Doc]], Ragged], model: Model[Tuple[List[Doc], Ragged], Floats2d], spans_key: str, - threshold: float = 0.5, - max_positive: Optional[int] = None, + scorer: Optional[Callable], + threshold: float, + max_positive: Optional[int], ) -> "SpanCategorizer": """Create a SpanCategorizer component. The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller @@ -138,9 +140,28 @@ def make_spancat( threshold=threshold, max_positive=max_positive, name=name, + scorer=scorer, ) +def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + kwargs = dict(kwargs) + attr_prefix = "spans_" + key = kwargs["spans_key"] + kwargs.setdefault("attr", f"{attr_prefix}{key}") + kwargs.setdefault("allow_overlap", True) + kwargs.setdefault( + "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) + ) + kwargs.setdefault("has_annotation", lambda doc: key in doc.spans) + return Scorer.score_spans(examples, **kwargs) + + +@registry.scorers("spacy.spancat_scorer.v1") +def make_spancat_scorer(): + return spancat_score + + class SpanCategorizer(TrainablePipe): """Pipeline component to label spans of text. @@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe): spans_key: str = "spans", threshold: float = 0.5, max_positive: Optional[int] = None, + scorer: Optional[Callable] = spancat_score, ) -> None: """Initialize the span categorizer. @@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe): self.suggester = suggester self.model = model self.name = name + self.scorer = scorer @property def key(self) -> str: @@ -373,28 +396,6 @@ class SpanCategorizer(TrainablePipe): else: self.model.initialize() - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
- - DOCS: https://spacy.io/api/spancategorizer#score - """ - validate_examples(examples, "SpanCategorizer.score") - self._validate_categories(examples) - kwargs = dict(kwargs) - attr_prefix = "spans_" - kwargs.setdefault("attr", f"{attr_prefix}{self.key}") - kwargs.setdefault("labels", self.labels) - kwargs.setdefault("multi_label", True) - kwargs.setdefault("threshold", self.cfg["threshold"]) - kwargs.setdefault( - "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) - ) - kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans) - return Scorer.score_spans(examples, **kwargs) - def _validate_categories(self, examples): # TODO pass diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index fa260bdd6..327a18f25 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Callable, Optional import numpy import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config @@ -18,6 +19,7 @@ from ..parts_of_speech import X from ..errors import Errors, Warnings from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util @@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL}, + default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model): +def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]): """Construct a part-of-speech tagger component. model (Model[List[Doc], List[Floats2d]]): A model instance that predicts @@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model): in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name) + return Tagger(nlp.vocab, model, name, scorer=scorer) + + +def tagger_score(examples, **kwargs): + return Scorer.score_token_attr(examples, "tag", **kwargs) + + +@registry.scorers("spacy.tagger_scorer.v1") +def make_tagger_scorer(): + return tagger_score class Tagger(TrainablePipe): @@ -60,7 +71,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger"): + def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. @@ -76,6 +87,7 @@ class Tagger(TrainablePipe): self._rehearsal_model = None cfg = {"labels": []} self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -289,15 +301,3 @@ class Tagger(TrainablePipe): self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag". 
- - DOCS: https://spacy.io/api/tagger#score - """ - validate_examples(examples, "Tagger.score") - return Scorer.score_token_attr(examples, "tag", **kwargs) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 0dde5de82..5ede18424 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_SINGLE_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -96,7 +105,21 @@ def make_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) + + +def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=False, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_scorer.v1") +def make_textcat_scorer(): + return textcat_score class TextCategorizer(TrainablePipe): @@ -106,7 +129,13 @@ class TextCategorizer(TrainablePipe): """ def __init__( - self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float + self, + vocab: Vocab, + model: Model, + name: str = "textcat", + *, + threshold: float, + scorer: Optional[Callable] = textcat_score, ) -> None: """Initialize a text categorizer for single-label classification. @@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) + self.scorer = scorer @property def labels(self) -> Tuple[str]: @@ -354,26 +384,6 @@ class TextCategorizer(TrainablePipe): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
- - DOCS: https://spacy.io/api/textcategorizer#score - """ - validate_examples(examples, "TextCategorizer.score") - self._validate_categories(examples) - kwargs.setdefault("threshold", self.cfg["threshold"]) - kwargs.setdefault("positive_label", self.cfg["positive_label"]) - return Scorer.score_cats( - examples, - "cats", - labels=self.labels, - multi_label=False, - **kwargs, - ) - def _validate_categories(self, examples: List[Example]): """Check whether the provided examples all have single-label cats annotations.""" for ex in examples: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index ba36881af..efa7d28b5 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -5,10 +5,11 @@ from thinc.api import Model, Config from thinc.types import Floats2d from ..language import Language -from ..training import Example, validate_examples, validate_get_examples +from ..training import Example, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab from .textcat import TextCategorizer @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat_multilabel", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_MULTI_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_multilabel_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -97,7 +106,23 @@ def make_multilabel_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". """ - return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return MultiLabel_TextCategorizer( + nlp.vocab, model, name, threshold=threshold, scorer=scorer + ) + + +def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=True, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_multilabel_scorer.v1") +def make_textcat_multilabel_scorer(): + return textcat_multilabel_score class MultiLabel_TextCategorizer(TextCategorizer): @@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): name: str = "textcat_multilabel", *, threshold: float, + scorer: Optional[Callable] = textcat_multilabel_score, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) + self.scorer = scorer def initialize( self, @@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. 
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index d5cdbb511..65daa8b22 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
+    cdef public object scorer
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 5e11f5972..fa2b28aa5 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
         beam_density=0.0,
         beam_update_prob=0.0,
         multitasks=tuple(),
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=None,
     ):
         """Create a Parser.

@@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe):
             self.add_multitask_objective(multitask)

         self._rehearsal_model = None
+        self.scorer = scorer

     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
diff --git a/spacy/scorer.py b/spacy/scorer.py
index f4ccb2269..bd305c123 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -537,7 +537,7 @@ class Scorer:

     @staticmethod
     def score_links(
-        examples: Iterable[Example], *, negative_labels: Iterable[str]
+        examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
     ) -> Dict[str, Any]:
         """Returns PRF for predicted links on the entity level.
         To disentangle the performance of the NEL from the NER,
@@ -711,7 +711,7 @@ class Scorer:
     }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
    score_per_type = defaultdict(PRFScore)
    for eg in examples:
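Note: the two signature loosenings above (**cfg on score_links, **kwargs on get_ner_prf) give every scorer the same (examples, **cfg) calling shape, so evaluation code can pass an arbitrary scorer_cfg through without knowing which keys each scorer consumes. A minimal sketch of that convention; run_scorer is illustrative, not spaCy API:

from spacy.scorer import get_ner_prf

def run_scorer(scorer, examples, scorer_cfg):
    # Keys a scorer does not use are absorbed by its **kwargs/**cfg
    # instead of raising TypeError.
    return scorer(examples, **scorer_cfg)

# Before this change, get_ner_prf(examples, threshold=0.5) was a TypeError;
# now the unused key is simply ignored:
# run_scorer(get_ner_prf, examples, {"threshold": 0.5})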
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index 9c750ffd0..dab3ebf57 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -32,24 +32,6 @@ def pattern_dicts():
     ]


-@registry.misc("attribute_ruler_patterns")
-def attribute_ruler_patterns():
-    return [
-        {
-            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
-            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
-        },
-        # one pattern sets the lemma
-        {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
-        # another pattern sets the morphology
-        {
-            "patterns": [[{"ORTH": "test"}]],
-            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
-            "index": 0,
-        },
-    ]
-
-
 @pytest.fixture
 def tag_map():
     return {
@@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
+    # initialize with patterns from misc registry
+    @registry.misc("attribute_ruler_patterns")
+    def attribute_ruler_patterns():
+        return [
+            {
+                "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
+                "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
+            },
+            # one pattern sets the lemma
+            {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
+            # another pattern sets the morphology
+            {
+                "patterns": [[{"ORTH": "test"}]],
+                "attrs": {"MORPH": "Case=Nom|Number=Sing"},
+                "index": 0,
+            },
+        ]
+
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }
@@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
     assert scores["lemma_acc"] == pytest.approx(0.2)
     # no morphs are set
     assert scores["morph_acc"] is None
+    nlp.remove_pipe("attribute_ruler")
+
+    # test with custom scorer
+    @registry.misc("weird_scorer.v1")
+    def make_weird_scorer():
+        def weird_scorer(examples, weird_score, **kwargs):
+            return {"weird_score": weird_score}
+
+        return weird_scorer
+
+    ruler = nlp.add_pipe(
+        "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
+    )
+    ruler.initialize(lambda: [], patterns=pattern_dicts)
+    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
+    assert scores["weird_score"] == 0.12345
+    assert "token_acc" in scores
+    assert "lemma_acc" not in scores
+    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
+    assert scores["weird_score"] == 0.23456


 def test_attributeruler_rule_order(nlp):
diff --git a/spacy/util.py b/spacy/util.py
index 421287ce2..5270bf080 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -95,6 +95,7 @@ class registry(thinc.registry):
     readers = catalogue.create("spacy", "readers", entry_points=True)
     augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
+    scorers = catalogue.create("spacy", "scorers", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
     # load them via the entry points. The "true" factories are added via the
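Note: the one-line util.py change above is what makes the @scorers config blocks in the earlier hunks resolvable, and with entry_points=True third-party packages can also publish scorers through an entry-point group (presumably spacy_scorers, by analogy with the spacy_factories group named in the surrounding comment). A quick sketch of looking a scorer up by name on this branch:

from spacy.util import registry

# Fetch the factory registered as "spacy.textcat_scorer.v1" in
# spacy/pipeline/textcat.py above...
make_scorer = registry.scorers.get("spacy.textcat_scorer.v1")
# ...and call it to obtain the plain scoring function (textcat_score).
textcat_scorer = make_scorer()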