Refactor scoring methods to use registered functions (#8766)

* Add scorer option to components

Add an optional `scorer` parameter to all pipeline components. If a
scoring function is provided, it overrides the default scoring method
for that component.

* Add registered scorers for all components

* Add `scorers` registry
* Move all scoring methods out of the components as independent
  functions and register them
* Use the registered scoring methods as defaults in configs and inits

Additional:

* The scoring methods no longer have access to the full component, so
  `Pipe.score` passes settings from `cfg` as default scorer options to
  handle settings such as `labels`, `threshold`, and `positive_label`
* The `attribute_ruler` scoring method no longer has access to the
  patterns, so the scorers for all supported attributes (`tag`, `pos`,
  `morph`, and `lemma`) are always called
* Bug fix: `spancat` scoring method is updated to set `allow_overlap` to
  score overlapping spans correctly

* Update Russian lemmatizer to use direct score method

* Check type of cfg in Pipe.score

* Fix check

* Update spacy/pipeline/sentencizer.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove validate_examples from scoring functions

* Use Pipe.labels instead of Pipe.cfg["labels"]

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Adriane Boyd 2021-08-10 15:13:39 +02:00 committed by GitHub
parent ee011ca963
commit f99d6d5e39
36 changed files with 638 additions and 363 deletions
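
As a quick illustration of the API this commit introduces (the registry name `custom_lemma_scorer.v1` and the scorer body below are hypothetical, not part of the commit), a user-defined scorer can be registered once and wired into any component via its config:

from typing import Any, Dict, Iterable

from spacy import registry
from spacy.scorer import Scorer
from spacy.training import Example


@registry.scorers("custom_lemma_scorer.v1")
def make_custom_lemma_scorer():
    # Scorers keep the same (examples, **kwargs) -> Dict[str, Any] signature
    # as the built-in functions; settings such as "labels", "threshold", or
    # "positive_label" arrive as keyword arguments forwarded from the
    # component's cfg by Pipe.score.
    def custom_lemma_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        return Scorer.score_token_attr(examples, "lemma", **kwargs)

    return custom_lemma_score


# Override a component's default scorer at construction time:
# nlp.add_pipe("lemmatizer", config={"scorer": {"@scorers": "custom_lemma_scorer.v1"}})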

spacy/lang/bn/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Bengali"]

spacy/lang/ca/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
@@ -28,13 +28,25 @@ class Catalan(Language):
 @Catalan.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return CatalanLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Catalan"]

spacy/lang/el/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return GreekLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Greek"]

spacy/lang/en/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return EnglishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["English"]

spacy/lang/es/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
 @Spanish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return SpanishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Spanish"]

spacy/lang/fa/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Persian"]

spacy/lang/fr/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
@@ -31,13 +31,25 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return FrenchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["French"]

spacy/lang/it/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
@@ -23,13 +23,25 @@ class Italian(Language):
 @Italian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return ItalianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Italian"]

spacy/lang/mk/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .lemmatizer import MacedonianLemmatizer
 from .stop_words import STOP_WORDS
@@ -38,13 +38,25 @@ class Macedonian(Language):
 @Macedonian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return MacedonianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Macedonian"]

spacy/lang/nb/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -26,13 +26,25 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Norwegian"]

spacy/lang/nl/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
@@ -30,13 +30,25 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return DutchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Dutch"]

spacy/lang/pl/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
@@ -33,13 +33,25 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return PolishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Polish"]

spacy/lang/ru/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
@@ -22,7 +22,12 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -31,8 +36,11 @@ def make_lemmatizer(
     name: str,
     mode: str,
     overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return RussianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Russian"]

spacy/lang/ru/lemmatizer.py

@@ -1,8 +1,9 @@
-from typing import Optional, List, Dict, Tuple
+from typing import Optional, List, Dict, Tuple, Callable
 from thinc.api import Model
 from ...pipeline import Lemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...symbols import POS
 from ...tokens import Token
 from ...vocab import Vocab
@@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text

spacy/lang/sv/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -29,13 +29,25 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Swedish"]

spacy/lang/uk/__init__.py

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
@@ -23,13 +23,25 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return UkrainianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )

 __all__ = ["Ukrainian"]

spacy/lang/uk/lemmatizer.py

@@ -1,8 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from ..ru.lemmatizer import RussianLemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...vocab import Vocab
@@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

spacy/pipeline/attributeruler.py

@@ -5,15 +5,15 @@ from pathlib import Path
 from .pipe import Pipe
 from ..errors import Errors
-from ..training import validate_examples, Example
+from ..training import Example
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer
-from ..symbols import IDS, TAG, POS, MORPH, LEMMA
+from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
 from ..vocab import Vocab
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
 from .. import util
@@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
 MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]

-@Language.factory("attribute_ruler", default_config={"validate": False})
-def make_attribute_ruler(nlp: Language, name: str, validate: bool):
-    return AttributeRuler(nlp.vocab, name, validate=validate)
+@Language.factory(
+    "attribute_ruler",
+    default_config={
+        "validate": False,
+        "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
+    },
+)
+def make_attribute_ruler(
+    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
+):
+    return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
+
+
+def attribute_ruler_score(
+    examples: Iterable[Example], **kwargs
+) -> Dict[str, Any]:
+    def morph_key_getter(token, attr):
+        return getattr(token, attr).key
+
+    results = {}
+    results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+    results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+    results.update(
+        Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
+    )
+    results.update(
+        Scorer.score_token_attr_per_feat(
+            examples, "morph", getter=morph_key_getter, **kwargs
+        )
+    )
+    results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+    return results
+
+
+@registry.scorers("spacy.attribute_ruler_scorer.v1")
+def make_attribute_ruler_scorer():
+    return attribute_ruler_score

 class AttributeRuler(Pipe):
@@ -36,7 +70,12 @@ class AttributeRuler(Pipe):
     """

     def __init__(
-        self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
+        self,
+        vocab: Vocab,
+        name: str = "attribute_ruler",
+        *,
+        validate: bool = False,
+        scorer: Optional[Callable] = attribute_ruler_score,
     ) -> None:
         """Create the AttributeRuler. After creation, you can add patterns
         with the `.initialize()` or `.add_patterns()` methods, or load patterns
@@ -57,6 +96,7 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
+        self.scorer = scorer

     def clear(self) -> None:
         """Reset all patterns."""
@@ -228,45 +268,6 @@ class AttributeRuler(Pipe):
             all_patterns.append(p)
         return all_patterns

-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "tag", "pos", "morph"
-            and "lemma" for the target token attributes.
-
-        DOCS: https://spacy.io/api/tagger#score
-        """
-        def morph_key_getter(token, attr):
-            return getattr(token, attr).key
-
-        validate_examples(examples, "AttributeRuler.score")
-        results = {}
-        attrs = set()
-        for token_attrs in self.attrs:
-            attrs.update(token_attrs)
-        for attr in attrs:
-            if attr == TAG:
-                results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
-            elif attr == POS:
-                results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-            elif attr == MORPH:
-                results.update(
-                    Scorer.score_token_attr(
-                        examples, "morph", getter=morph_key_getter, **kwargs
-                    )
-                )
-                results.update(
-                    Scorer.score_token_attr_per_feat(
-                        examples, "morph", getter=morph_key_getter, **kwargs
-                    )
-                )
-            elif attr == LEMMA:
-                results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
-        return results
-
     def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
         """Serialize the AttributeRuler to a bytestring.

spacy/pipeline/dep_parser.pyx

@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config

 from ._parser_internals.transition_system import TransitionSystem
@@ -12,7 +12,7 @@ from ..language import Language
 from ._parser_internals import nonproj
 from ._parser_internals.nonproj import DELIMITER
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..util import registry

 default_model_config = """
@@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
+        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
     },
     default_score_weights={
         "dep_uas": 0.5,
@@ -63,7 +64,8 @@ def make_parser(
     moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
-    min_action_freq: int
+    min_action_freq: int,
+    scorer: Optional[Callable],
 ):
     """Create a transition-based DependencyParser component. The dependency parser
     jointly learns sentence segmentation and labelled dependency parsing, and can
@@ -115,7 +117,8 @@ def make_parser(
         beam_update_prob=0.0,
         # At some point in the future we can try to implement support for
         # partial annotations, perhaps only in the beam objective.
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=scorer,
     )

 @Language.factory(
@@ -130,6 +133,7 @@ def make_parser(
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
+        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
     },
     default_score_weights={
         "dep_uas": 0.5,
@@ -151,6 +155,7 @@ def make_beam_parser(
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
+    scorer: Optional[Callable],
 ):
     """Create a transition-based DependencyParser component that uses beam-search.
     The dependency parser jointly learns sentence segmentation and labelled
@@ -207,10 +212,41 @@ def make_beam_parser(
         min_action_freq=min_action_freq,
         # At some point in the future we can try to implement support for
         # partial annotations, perhaps only in the beam objective.
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=scorer,
     )
+
+
+def parser_score(examples, **kwargs):
+    """Score a batch of examples.
+
+    examples (Iterable[Example]): The examples to score.
+    RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
+        and Scorer.score_deps.
+
+    DOCS: https://spacy.io/api/dependencyparser#score
+    """
+    def has_sents(doc):
+        return doc.has_annotation("SENT_START")
+
+    def dep_getter(token, attr):
+        dep = getattr(token, attr)
+        dep = token.vocab.strings.as_string(dep).lower()
+        return dep
+
+    results = {}
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    kwargs.setdefault("getter", dep_getter)
+    kwargs.setdefault("ignore_labels", ("p", "punct"))
+    results.update(Scorer.score_deps(examples, "dep", **kwargs))
+    del results["sents_per_type"]
+    return results
+
+
+@registry.scorers("spacy.parser_scorer.v1")
+def make_parser_scorer():
+    return parser_score

 cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
@@ -233,6 +269,7 @@ cdef class DependencyParser(Parser):
             beam_update_prob=0.0,
             multitasks=tuple(),
             incorrect_spans_key=None,
+            scorer=parser_score,
     ):
         """Create a DependencyParser.
         """
@@ -249,6 +286,7 @@ cdef class DependencyParser(Parser):
             beam_update_prob=beam_update_prob,
             multitasks=multitasks,
             incorrect_spans_key=incorrect_spans_key,
+            scorer=scorer,
         )

     @property
@@ -281,31 +319,6 @@ cdef class DependencyParser(Parser):
                 labels.add(label)
         return tuple(sorted(labels))

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
-            and Scorer.score_deps.
-
-        DOCS: https://spacy.io/api/dependencyparser#score
-        """
-        def has_sents(doc):
-            return doc.has_annotation("SENT_START")
-
-        validate_examples(examples, "DependencyParser.score")
-        def dep_getter(token, attr):
-            dep = getattr(token, attr)
-            dep = token.vocab.strings.as_string(dep).lower()
-            return dep
-        results = {}
-        results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
-        kwargs.setdefault("getter", dep_getter)
-        kwargs.setdefault("ignore_labels", ("p", "punct"))
-        results.update(Scorer.score_deps(examples, "dep", **kwargs))
-        del results["sents_per_type"]
-        return results
-
     def scored_parses(self, beams):
         """Return two dictionaries with scores for each beam/doc that was processed:
         one containing (i, head) keys, and another containing (i, label) keys.

spacy/pipeline/entity_linker.py

@@ -16,7 +16,7 @@ from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer
@@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
+        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -68,6 +69,7 @@ def make_entity_linker(
     incl_context: bool,
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+    scorer: Optional[Callable],
 ):
     """Construct an EntityLinker component.
@@ -92,9 +94,19 @@ def make_entity_linker(
         incl_context=incl_context,
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
+        scorer=scorer,
     )
+
+
+def entity_linker_score(examples, **kwargs):
+    return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
+
+
+@registry.scorers("spacy.entity_linker_scorer.v1")
+def make_entity_linker_scorer():
+    return entity_linker_score

 class EntityLinker(TrainablePipe):
     """Pipeline component for named entity linking.
@@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_context: bool,
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+        scorer: Optional[Callable] = entity_linker_score,
     ) -> None:
         """Initialize an entity linker.
@@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe):
         # how many neighbour sentences to take into account
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.scorer = scorer

     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
@@ -389,17 +403,6 @@ class EntityLinker(TrainablePipe):
                 for token in ent:
                     token.ent_kb_id_ = kb_id

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores.
-
-        DOCS TODO: https://spacy.io/api/entity_linker#score
-        """
-        validate_examples(examples, "EntityLinker.score")
-        return Scorer.score_links(examples, negative_labels=[self.NIL])
-
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.

spacy/pipeline/entityruler.py

@@ -8,11 +8,10 @@ from .pipe import Pipe
 from ..training import Example
 from ..language import Language
 from ..errors import Errors, Warnings
-from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
+from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import get_ner_prf
-from ..training import validate_examples

 DEFAULT_ENT_ID_SEP = "||"
@@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
         "validate": False,
         "overwrite_ents": False,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
+        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
     },
     default_score_weights={
         "ents_f": 1.0,
@@ -42,6 +42,7 @@ def make_entity_ruler(
     validate: bool,
     overwrite_ents: bool,
     ent_id_sep: str,
+    scorer: Optional[Callable],
 ):
     return EntityRuler(
         nlp,
@@ -50,9 +51,19 @@ def make_entity_ruler(
         validate=validate,
         overwrite_ents=overwrite_ents,
         ent_id_sep=ent_id_sep,
+        scorer=scorer,
     )
+
+
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score

 class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
@@ -74,6 +85,7 @@ class EntityRuler(Pipe):
         overwrite_ents: bool = False,
         ent_id_sep: str = DEFAULT_ENT_ID_SEP,
         patterns: Optional[List[PatternType]] = None,
+        scorer: Optional[Callable] = entity_ruler_score,
     ) -> None:
         """Initialize the entity ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -112,6 +124,7 @@ class EntityRuler(Pipe):
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
             self.add_patterns(patterns)
+        self.scorer = scorer

     def __len__(self) -> int:
         """The number of all patterns added to the entity ruler."""
@@ -358,10 +371,6 @@ class EntityRuler(Pipe):
             label = f"{label}{self.ent_id_sep}{ent_id}"
         return label

-    def score(self, examples, **kwargs):
-        validate_examples(examples, "EntityRuler.score")
-        return get_ner_prf(examples)
-
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":

spacy/pipeline/lemmatizer.py

@@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
-from ..training import validate_examples
-from ..util import logger, SimpleFrozenList
+from ..util import logger, SimpleFrozenList, registry
 from .. import util

 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
+
+
+def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+    return Scorer.score_token_attr(examples, "lemma", **kwargs)
+
+
+@registry.scorers("spacy.lemmatizer_scorer.v1")
+def make_lemmatizer_scorer():
+    return lemmatizer_score

 class Lemmatizer(Pipe):
@@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
         *,
         mode: str = "lookup",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -89,6 +110,7 @@ class Lemmatizer(Pipe):
             raise ValueError(Errors.E1003.format(mode=mode))
         self.lemmatize = getattr(self, mode_attr)
         self.cache = {}
+        self.scorer = scorer

     @property
     def mode(self):
@@ -247,17 +269,6 @@ class Lemmatizer(Pipe):
         """
         return False

-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores.
-
-        DOCS: https://spacy.io/api/lemmatizer#score
-        """
-        validate_examples(examples, "Lemmatizer.score")
-        return Scorer.score_token_attr(examples, "lemma", **kwargs)
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ):

spacy/pipeline/morphologizer.pyx

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, Union, Dict
+from typing import Optional, Union, Dict, Callable
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -17,6 +17,7 @@ from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
+from ..util import registry

 default_model_config = """
@@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL},
+    default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
     nlp: Language,
     model: Model,
     name: str,
+    scorer: Optional[Callable],
 ):
-    return Morphologizer(nlp.vocab, model, name)
+    return Morphologizer(nlp.vocab, model, name, scorer=scorer)
+
+
+def morphologizer_score(examples, **kwargs):
+    def morph_key_getter(token, attr):
+        return getattr(token, attr).key
+
+    results = {}
+    results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+    results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+    results.update(Scorer.score_token_attr_per_feat(examples,
+        "morph", getter=morph_key_getter, **kwargs))
+    return results
+
+
+@registry.scorers("spacy.morphologizer_scorer.v1")
+def make_morphologizer_scorer():
+    return morphologizer_score

 class Morphologizer(Tagger):
@@ -67,6 +86,8 @@ class Morphologizer(Tagger):
         vocab: Vocab,
         model: Model,
         name: str = "morphologizer",
+        *,
+        scorer: Optional[Callable] = morphologizer_score,
     ):
         """Initialize a morphologizer.
@@ -87,6 +108,7 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": {}, "labels_pos": {}}
         self.cfg = dict(sorted(cfg.items()))
+        self.scorer = scorer

     @property
     def labels(self):
@@ -246,24 +268,3 @@ class Morphologizer(Tagger):
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
-
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "pos" and "morph" and
-            Scorer.score_token_attr_per_feat for the attribute "morph".
-
-        DOCS: https://spacy.io/api/morphologizer#score
-        """
-        def morph_key_getter(token, attr):
-            return getattr(token, attr).key
-
-        validate_examples(examples, "Morphologizer.score")
-        results = {}
-        results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
-        results.update(Scorer.score_token_attr_per_feat(examples,
-            "morph", getter=morph_key_getter, **kwargs))
-        return results

spacy/pipeline/ner.pyx

@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config

 from ._parser_internals.transition_system import TransitionSystem
@@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
 from ..language import Language
 from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
+from ..util import registry

 default_model_config = """
@@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "moves": None,
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
-        "incorrect_spans_key": None
+        "incorrect_spans_key": None,
+        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@@ -52,7 +53,8 @@ def make_ner(
     model: Model,
     moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
-    incorrect_spans_key: Optional[str]=None
+    incorrect_spans_key: Optional[str],
+    scorer: Optional[Callable],
 ):
     """Create a transition-based EntityRecognizer component. The entity recognizer
     identifies non-overlapping labelled spans of tokens.
@@ -92,6 +94,7 @@ def make_ner(
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
+        scorer=scorer,
     )

 @Language.factory(
@@ -104,7 +107,8 @@ def make_ner(
         "beam_density": 0.01,
         "beam_update_prob": 0.5,
         "beam_width": 32,
-        "incorrect_spans_key": None
+        "incorrect_spans_key": None,
+        "scorer": None,
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
@@ -117,7 +121,8 @@ def make_beam_ner(
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
-    incorrect_spans_key: Optional[str]=None
+    incorrect_spans_key: Optional[str],
+    scorer: Optional[Callable],
 ):
     """Create a transition-based EntityRecognizer component that uses beam-search.
     The entity recognizer identifies non-overlapping labelled spans of tokens.
@@ -164,10 +169,20 @@ def make_beam_ner(
         beam_width=beam_width,
         beam_density=beam_density,
         beam_update_prob=beam_update_prob,
-        incorrect_spans_key=incorrect_spans_key
+        incorrect_spans_key=incorrect_spans_key,
+        scorer=scorer,
     )
+
+
+def ner_score(examples, **kwargs):
+    return get_ner_prf(examples, **kwargs)
+
+
+@registry.scorers("spacy.ner_scorer.v1")
+def make_ner_scorer():
+    return ner_score

 cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
@@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser):
             beam_update_prob=0.0,
             multitasks=tuple(),
             incorrect_spans_key=None,
+            scorer=ner_score,
     ):
         """Create an EntityRecognizer.
         """
@@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser):
             beam_update_prob=beam_update_prob,
             multitasks=multitasks,
             incorrect_spans_key=incorrect_spans_key,
+            scorer=scorer,
         )

     def add_multitask_objective(self, mt_component):
@@ -227,17 +244,6 @@ cdef class EntityRecognizer(Parser):
                       if move[0] in ("B", "I", "L", "U"))
         return tuple(sorted(labels))

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
-
-        DOCS: https://spacy.io/api/entityrecognizer#score
-        """
-        validate_examples(examples, "EntityRecognizer.score")
-        return get_ner_prf(examples)
-
     def scored_ents(self, beams):
         """Return a dictionary of (start, end, label) tuples with corresponding scores
         for each beam/doc that was processed.

spacy/pipeline/pipe.pyx

@@ -81,6 +81,17 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#score
         """
+        if hasattr(self, "scorer") and self.scorer is not None:
+            scorer_kwargs = {}
+            # use default settings from cfg (e.g., threshold)
+            if hasattr(self, "cfg") and isinstance(self.cfg, dict):
+                scorer_kwargs.update(self.cfg)
+            # override self.cfg["labels"] with self.labels
+            if hasattr(self, "labels"):
+                scorer_kwargs["labels"] = self.labels
+            # override with kwargs settings
+            scorer_kwargs.update(kwargs)
+            return self.scorer(examples, **scorer_kwargs)
         return {}

     @property
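
For illustration, a standalone sketch (hypothetical helper, not from the diff) of the precedence `Pipe.score` implements above: `cfg` supplies defaults, `Pipe.labels` overrides `cfg["labels"]`, and caller kwargs override both:

from typing import Any, Dict, Tuple


def resolve_scorer_kwargs(
    cfg: Dict[str, Any], labels: Tuple[str, ...], kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    # Mirrors Pipe.score: cfg defaults < Pipe.labels < caller kwargs.
    scorer_kwargs: Dict[str, Any] = {}
    scorer_kwargs.update(cfg)
    scorer_kwargs["labels"] = labels
    scorer_kwargs.update(kwargs)
    return scorer_kwargs


# cfg supplies "threshold", the labels property wins over cfg["labels"],
# and an explicit keyword argument wins over both:
assert resolve_scorer_kwargs(
    {"threshold": 0.5, "labels": ["a"]}, ("a", "b"), {"threshold": 0.9}
) == {"threshold": 0.9, "labels": ("a", "b")}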

spacy/pipeline/sentencizer.pyx

@@ -1,26 +1,29 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, List
+from typing import Optional, List, Callable
 import srsly

 from ..tokens.doc cimport Doc

 from .pipe import Pipe
+from .senter import senter_score
 from ..language import Language
 from ..scorer import Scorer
-from ..training import validate_examples
 from .. import util

 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
-    default_config={"punct_chars": None},
+    default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_sentencizer(
     nlp: Language,
     name: str,
-    punct_chars: Optional[List[str]]
+    punct_chars: Optional[List[str]],
+    scorer: Optional[Callable],
 ):
-    return Sentencizer(name, punct_chars=punct_chars)
+    return Sentencizer(name, punct_chars=punct_chars, scorer=scorer)

 class Sentencizer(Pipe):
@@ -41,7 +44,13 @@ class Sentencizer(Pipe):
             '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
             '', '']

-    def __init__(self, name="sentencizer", *, punct_chars=None):
+    def __init__(
+        self,
+        name="sentencizer",
+        *,
+        punct_chars=None,
+        scorer=senter_score,
+    ):
         """Initialize the sentencizer.

         punct_chars (list): Punctuation characters to split on. Will be
@@ -55,6 +64,7 @@ class Sentencizer(Pipe):
             self.punct_chars = set(punct_chars)
         else:
             self.punct_chars = set(self.default_punct_chars)
+        self.scorer = scorer

     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -122,22 +132,6 @@ class Sentencizer(Pipe):
                 else:
                     doc.c[j].sent_start = -1

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
-
-        DOCS: https://spacy.io/api/sentencizer#score
-        """
-        def has_sents(doc):
-            return doc.has_annotation("SENT_START")
-
-        validate_examples(examples, "Sentencizer.score")
-        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-        del results["sents_per_type"]
-        return results
-
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the sentencizer to a bytestring.

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice from itertools import islice
from typing import Optional, Callable
import srsly import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -11,6 +12,7 @@ from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"senter", "senter",
assigns=["token.is_sent_start"], assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL}, default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_senter(nlp: Language, name: str, model: Model): def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name) return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger): class SentenceRecognizer(Tagger):
@ -46,7 +62,7 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer DOCS: https://spacy.io/api/sentencerecognizer
""" """
def __init__(self, vocab, model, name="senter"): def __init__(self, vocab, model, name="senter", *, scorer=senter_score):
"""Initialize a sentence recognizer. """Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger):
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
self.cfg = {} self.cfg = {}
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -153,18 +170,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
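
Because the senter's scoring logic now lives in a module-level function behind the scorers registry, it can be resolved and called directly on Example objects. An illustrative snippet under the same assumption; the sample text is arbitrary:

    import spacy
    from spacy.training import Example
    from spacy.util import registry

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")

    # Resolving the registered name yields the factory, which returns senter_score.
    senter_score = registry.scorers.get("spacy.senter_scorer.v1")()

    doc = nlp("This is one sentence. This is another.")
    example = Example(nlp("This is one sentence. This is another."), doc)
    print(senter_score([example]))  # {'sents_p': 1.0, 'sents_r': 1.0, 'sents_f': 1.0}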
View File
@ -98,6 +98,7 @@ def build_ngram_range_suggester(
"max_positive": None, "max_positive": None,
"model": DEFAULT_SPANCAT_MODEL, "model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -107,8 +108,9 @@ def make_spancat(
suggester: Callable[[List[Doc]], Ragged], suggester: Callable[[List[Doc]], Ragged],
model: Model[Tuple[List[Doc], Ragged], Floats2d], model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str, spans_key: str,
threshold: float = 0.5, scorer: Optional[Callable],
max_positive: Optional[int] = None, threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two """Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller parts: a suggester function that proposes candidate spans, and a labeller
@ -138,9 +140,28 @@ def make_spancat(
threshold=threshold, threshold=threshold,
max_positive=max_positive, max_positive=max_positive,
name=name, name=name,
scorer=scorer,
) )
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score
class SpanCategorizer(TrainablePipe): class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text. """Pipeline component to label spans of text.
@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans", spans_key: str = "spans",
threshold: float = 0.5, threshold: float = 0.5,
max_positive: Optional[int] = None, max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None: ) -> None:
"""Initialize the span categorizer. """Initialize the span categorizer.
@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester self.suggester = suggester
self.model = model self.model = model
self.name = name self.name = name
self.scorer = scorer
@property @property
def key(self) -> str: def key(self) -> str:
@ -373,28 +396,6 @@ class SpanCategorizer(TrainablePipe):
else: else:
self.model.initialize() self.model.initialize()
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("labels", self.labels)
kwargs.setdefault("multi_label", True)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
def _validate_categories(self, examples): def _validate_categories(self, examples):
# TODO # TODO
pass pass
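
The standalone spancat_score no longer sees the component, so spans_key must arrive as a keyword argument (within a pipeline it is forwarded from the component's cfg), and allow_overlap now defaults to True so overlapping spans score correctly. A hand-rolled sketch with made-up docs and labels:

    import spacy
    from spacy.pipeline.spancat import spancat_score
    from spacy.tokens import Span
    from spacy.training import Example

    nlp = spacy.blank("en")
    pred = nlp("Berlin and New York")
    pred.spans["cities"] = [Span(pred, 0, 1, label="CITY")]
    ref = nlp("Berlin and New York")
    ref.spans["cities"] = [
        Span(ref, 0, 1, label="CITY"),
        Span(ref, 2, 4, label="CITY"),
    ]
    # spans_key selects doc.spans["cities"]: precision 1.0, recall 0.5 here.
    print(spancat_score([Example(pred, ref)], spans_key="cities"))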
View File
@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy import numpy
import srsly import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@ -18,6 +19,7 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"tagger", "tagger",
assigns=["token.tag"], assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL}, default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0}, default_score_weights={"tag_acc": 1.0},
) )
def make_tagger(nlp: Language, name: str, model: Model): def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
"""Construct a part-of-speech tagger component. """Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1, in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1). with the rows summing to 1).
""" """
return Tagger(nlp.vocab, model, name) return Tagger(nlp.vocab, model, name, scorer=scorer)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe): class Tagger(TrainablePipe):
@ -60,7 +71,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger DOCS: https://spacy.io/api/tagger
""" """
def __init__(self, vocab, model, name="tagger"): def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score):
"""Initialize a part-of-speech tagger. """Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
@ -76,6 +87,7 @@ class Tagger(TrainablePipe):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": []} cfg = {"labels": []}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -289,15 +301,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label) self.cfg["labels"].append(label)
self.vocab.strings.add(label) self.vocab.strings.add(label)
return 1 return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".
DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
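
With the scorer exposed as a factory setting, the tagger's metric can be replaced without subclassing. A sketch assuming this branch; "coarse_tag_scorer.v1" is a hypothetical registry entry that scores coarse-grained POS instead of TAG:

    import spacy
    from spacy.scorer import Scorer
    from spacy.util import registry

    @registry.scorers("coarse_tag_scorer.v1")  # hypothetical name
    def make_coarse_tag_scorer():
        def score(examples, **kwargs):
            # Score the coarse-grained POS attribute instead of TAG.
            return Scorer.score_token_attr(examples, "pos", **kwargs)
        return score

    nlp = spacy.blank("en")
    nlp.add_pipe("tagger", config={"scorer": {"@scorers": "coarse_tag_scorer.v1"}})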
View File
@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat", "textcat",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_textcat( def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -96,7 +105,21 @@ def make_textcat(
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
""" """
return TextCategorizer(nlp.vocab, model, name, threshold=threshold) return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe): class TextCategorizer(TrainablePipe):
@ -106,7 +129,13 @@ class TextCategorizer(TrainablePipe):
""" """
def __init__( def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None: ) -> None:
"""Initialize a text categorizer for single-label classification. """Initialize a text categorizer for single-label classification.
@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None} cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
@property @property
def labels(self) -> Tuple[str]: def labels(self) -> Tuple[str]:
@ -354,26 +384,6 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)
def _validate_categories(self, examples: List[Example]): def _validate_categories(self, examples: List[Example]):
"""Check whether the provided examples all have single-label cats annotations.""" """Check whether the provided examples all have single-label cats annotations."""
for ex in examples: for ex in examples:
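
Since textcat_score no longer reads labels, threshold, or positive_label from the component (they are forwarded from cfg when scoring through the pipeline), direct calls must pass them explicitly. A small sketch with invented categories:

    import spacy
    from spacy.pipeline.textcat import textcat_score
    from spacy.training import Example

    nlp = spacy.blank("en")
    pred = nlp("a great movie")
    pred.cats = {"POS": 0.9, "NEG": 0.1}
    ref = nlp("a great movie")
    ref.cats = {"POS": 1.0, "NEG": 0.0}

    scores = textcat_score(
        [Example(pred, ref)],
        labels=["POS", "NEG"],
        positive_label="POS",
        threshold=0.5,
    )
    print(scores["cats_score"], scores["cats_score_desc"])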
View File
@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d from thinc.types import Floats2d
from ..language import Language from ..language import Language
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
from .textcat import TextCategorizer from .textcat import TextCategorizer
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat_multilabel", "textcat_multilabel",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_multilabel_textcat( def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
""" """
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer): class MultiLabel_TextCategorizer(TextCategorizer):
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel", name: str = "textcat_multilabel",
*, *,
threshold: float, threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None: ) -> None:
"""Initialize a text categorizer for multi-label classification. """Initialize a text categorizer for multi-label classification.
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold} cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
def initialize( def initialize(
self, self,
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)
def _validate_categories(self, examples: List[Example]): def _validate_categories(self, examples: List[Example]):
"""This component allows any type of single- or multi-label annotations. """This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'.""" This method overwrites the more strict one from 'textcat'."""
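
The multilabel variant differs only in fixing multi_label=True and taking no positive_label. A parallel sketch, again with invented data; two examples are used so the per-label ROC AUC is defined:

    import spacy
    from spacy.pipeline.textcat_multilabel import textcat_multilabel_score
    from spacy.training import Example

    nlp = spacy.blank("en")

    def make_example(text, pred_cats, ref_cats):
        pred, ref = nlp(text), nlp(text)
        pred.cats, ref.cats = pred_cats, ref_cats
        return Example(pred, ref)

    examples = [
        make_example("buy now", {"SPAM": 0.9, "URGENT": 0.2}, {"SPAM": 1.0, "URGENT": 0.0}),
        make_example("server down", {"SPAM": 0.1, "URGENT": 0.8}, {"SPAM": 0.0, "URGENT": 1.0}),
    ]
    # multi_label=True is fixed inside the function; AUC and per-label PRF
    # come back under keys like cats_macro_auc and cats_f_per_type.
    print(textcat_multilabel_score(examples, labels=["SPAM", "URGENT"], threshold=0.5))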
View File
@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab cdef public Vocab vocab
cdef public object model cdef public object model
cdef public object cfg cdef public object cfg
cdef public object scorer
View File
@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0, beam_density=0.0,
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None incorrect_spans_key=None,
scorer=None,
): ):
"""Create a Parser. """Create a Parser.
@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask) self.add_multitask_objective(multitask)
self._rehearsal_model = None self._rehearsal_model = None
self.scorer = scorer
def __getnewargs_ex__(self): def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments""" """This allows pickling the Parser and its keyword-only init arguments"""
View File
@ -537,7 +537,7 @@ class Scorer:
@staticmethod @staticmethod
def score_links( def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str] examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level. """Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER, To disentangle the performance of the NEL from the NER,
@ -711,7 +711,7 @@ class Scorer:
} }
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore) score_per_type = defaultdict(PRFScore)
for eg in examples: for eg in examples:
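
get_ner_prf gains a **kwargs catch-all for the same reason score_links gains **cfg: scoring functions are now called with whatever settings a component forwards, so they must tolerate keys they do not use. A toy sketch; the extra keyword is an arbitrary stand-in:

    import spacy
    from spacy.scorer import get_ner_prf
    from spacy.tokens import Span
    from spacy.training import Example

    nlp = spacy.blank("en")
    pred = nlp("Alice met Bob")
    pred.ents = [Span(pred, 0, 1, label="PERSON")]
    ref = nlp("Alice met Bob")
    ref.ents = [Span(ref, 0, 1, label="PERSON"), Span(ref, 2, 3, label="PERSON")]

    # The unused keyword is simply ignored instead of raising a TypeError.
    print(get_ner_prf([Example(pred, ref)], some_cfg_setting=True))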
View File
@ -32,24 +32,6 @@ def pattern_dicts():
] ]
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture @pytest.fixture
def tag_map(): def tag_map():
return { return {
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry # initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = { nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"} "patterns": {"@misc": "attribute_ruler_patterns"}
} }
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2) assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set # no morphs are set
assert scores["morph_acc"] is None assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp): def test_attributeruler_rule_order(nlp):
View File
@ -95,6 +95,7 @@ class registry(thinc.registry):
readers = catalogue.create("spacy", "readers", entry_points=True) readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True) augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the # These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily # spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the # load them via the entry points. The "true" factories are added via the
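
With the new scorers catalogue, custom scoring functions can be registered exactly like factories or loggers, and, by the catalogue conventions the other registries follow, presumably shipped from third-party packages via a "spacy_scorers" entry point. A minimal registration; the name and no-op metric are hypothetical:

    from spacy.util import registry

    @registry.scorers("my_pkg.null_scorer.v1")  # hypothetical name
    def make_null_scorer():
        def null_scorer(examples, **kwargs):
            # Report nothing; handy for disabling scoring on a component.
            return {}
        return null_scorer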