Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24)
Refactor scoring methods to use registered functions (#8766)
* Add scorer option to components

  Add an optional `scorer` parameter to all pipeline components. If a scoring function is provided, it overrides the default scoring method for that component.

* Add registered scorers for all components
  * Add `scorers` registry
  * Move all scoring methods outside of components as independent functions and register them
  * Use the registered scoring methods as defaults in configs and inits

  Additional:
  * The scoring methods no longer have access to the full component, so use settings from `cfg` as default scorer options to handle settings such as `labels`, `threshold`, and `positive_label`
  * The `attribute_ruler` scoring method no longer has access to the patterns, so all scoring methods are called
  * Bug fix: the `spancat` scoring method is updated to set `allow_overlap` to score overlapping spans correctly

* Update Russian lemmatizer to use direct score method
* Check type of cfg in Pipe.score
* Fix check
* Update spacy/pipeline/sentencizer.pyx
* Remove validate_examples from scoring functions
* Use Pipe.labels instead of Pipe.cfg["labels"]

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent ee011ca963
commit f99d6d5e39
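The diff below implements this pattern: each component's default scoring method becomes a standalone function, registered in the new `scorers` registry and referenced from the factory's `default_config` via `{"@scorers": "..."}`, with the resolved callable stored on the component as `self.scorer`. As a rough sketch of how a user-supplied scorer could plug into the same mechanism (the `my_custom.*` registry name and the blank English pipeline are illustrative assumptions, not part of this commit):

```python
from typing import Any, Dict, Iterable

import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import registry


def custom_lemma_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    # Same shape as the default lemmatizer scorer: report lemma accuracy only.
    return Scorer.score_token_attr(examples, "lemma", **kwargs)


@registry.scorers("my_custom.lemmatizer_scorer.v1")  # hypothetical registry name
def make_custom_lemma_scorer():
    return custom_lemma_score


nlp = spacy.blank("en")
# Override the registered default ("spacy.lemmatizer_scorer.v1") via the config.
nlp.add_pipe(
    "lemmatizer",
    config={"scorer": {"@scorers": "my_custom.lemmatizer_scorer.v1"}},
)
```

At evaluation time, `Pipe.score` forwards the component's `cfg` settings, its `labels`, and any caller kwargs to whichever callable was resolved, so a replacement scorer only needs to accept `examples` plus `**kwargs`.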
spacy/lang/bn/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Bengali"]

spacy/lang/ca/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -28,13 +28,25 @@ class Catalan(Language):
 @Catalan.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return CatalanLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Catalan"]

spacy/lang/el/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return GreekLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Greek"]

spacy/lang/en/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return EnglishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["English"]

spacy/lang/es/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
 @Spanish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return SpanishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Spanish"]

spacy/lang/fa/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Persian"]

spacy/lang/fr/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -31,13 +31,25 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return FrenchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["French"]

spacy/lang/it/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -23,13 +23,25 @@ class Italian(Language):
 @Italian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return ItalianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Italian"]

spacy/lang/mk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .lemmatizer import MacedonianLemmatizer
 from .stop_words import STOP_WORDS
@@ -38,13 +38,25 @@ class Macedonian(Language):
 @Macedonian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return MacedonianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Macedonian"]

spacy/lang/nb/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -26,13 +26,25 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Norwegian"]

spacy/lang/nl/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -30,13 +30,25 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return DutchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Dutch"]

spacy/lang/pl/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -33,13 +33,25 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return PolishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Polish"]

spacy/lang/ru/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -22,7 +22,12 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -31,8 +36,11 @@ def make_lemmatizer(
     name: str,
     mode: str,
     overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return RussianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Russian"]

spacy/lang/ru/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional, List, Dict, Tuple
+from typing import Optional, List, Dict, Tuple, Callable
 
 from thinc.api import Model
 
 from ...pipeline import Lemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...symbols import POS
 from ...tokens import Token
 from ...vocab import Vocab
@@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text

spacy/lang/sv/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -29,13 +29,25 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Swedish"]

spacy/lang/uk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -23,13 +23,25 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return UkrainianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Ukrainian"]

spacy/lang/uk/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...vocab import Vocab
 
 
@@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

@ -5,15 +5,15 @@ from pathlib import Path
|
|||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..training import validate_examples, Example
|
||||
from ..training import Example
|
||||
from ..language import Language
|
||||
from ..matcher import Matcher
|
||||
from ..scorer import Scorer
|
||||
from ..symbols import IDS, TAG, POS, MORPH, LEMMA
|
||||
from ..symbols import IDS
|
||||
from ..tokens import Doc, Span
|
||||
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
||||
from ..vocab import Vocab
|
||||
from ..util import SimpleFrozenList
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
|
|||
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
|
||||
|
||||
|
||||
@Language.factory("attribute_ruler", default_config={"validate": False})
|
||||
def make_attribute_ruler(nlp: Language, name: str, validate: bool):
|
||||
return AttributeRuler(nlp.vocab, name, validate=validate)
|
||||
@Language.factory(
|
||||
"attribute_ruler",
|
||||
default_config={
|
||||
"validate": False,
|
||||
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
|
||||
},
|
||||
)
|
||||
def make_attribute_ruler(
|
||||
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
|
||||
):
|
||||
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
|
||||
|
||||
|
||||
def attribute_ruler_score(
|
||||
examples: Iterable[Example], **kwargs
|
||||
) -> Dict[str, Any]:
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(
|
||||
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
|
||||
)
|
||||
results.update(
|
||||
Scorer.score_token_attr_per_feat(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.attribute_ruler_scorer.v1")
|
||||
def make_attribute_ruler_scorer():
|
||||
return attribute_ruler_score
|
||||
|
||||
|
||||
class AttributeRuler(Pipe):
|
||||
|
@ -36,7 +70,12 @@ class AttributeRuler(Pipe):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
|
||||
self,
|
||||
vocab: Vocab,
|
||||
name: str = "attribute_ruler",
|
||||
*,
|
||||
validate: bool = False,
|
||||
scorer: Optional[Callable] = attribute_ruler_score,
|
||||
) -> None:
|
||||
"""Create the AttributeRuler. After creation, you can add patterns
|
||||
with the `.initialize()` or `.add_patterns()` methods, or load patterns
|
||||
|
@ -57,6 +96,7 @@ class AttributeRuler(Pipe):
|
|||
self.attrs = []
|
||||
self._attrs_unnormed = [] # store for reference
|
||||
self.indices = []
|
||||
self.scorer = scorer
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Reset all patterns."""
|
||||
|
@ -228,45 +268,6 @@ class AttributeRuler(Pipe):
|
|||
all_patterns.append(p)
|
||||
return all_patterns
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by
|
||||
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
|
||||
and "lemma" for the target token attributes.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#score
|
||||
"""
|
||||
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
validate_examples(examples, "AttributeRuler.score")
|
||||
results = {}
|
||||
attrs = set()
|
||||
for token_attrs in self.attrs:
|
||||
attrs.update(token_attrs)
|
||||
for attr in attrs:
|
||||
if attr == TAG:
|
||||
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
|
||||
elif attr == POS:
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
elif attr == MORPH:
|
||||
results.update(
|
||||
Scorer.score_token_attr(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
results.update(
|
||||
Scorer.score_token_attr_per_feat(
|
||||
examples, "morph", getter=morph_key_getter, **kwargs
|
||||
)
|
||||
)
|
||||
elif attr == LEMMA:
|
||||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
||||
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the AttributeRuler to a bytestring.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Iterable
|
||||
from typing import Optional, Iterable, Callable
|
||||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
|
@ -12,7 +12,7 @@ from ..language import Language
|
|||
from ._parser_internals import nonproj
|
||||
from ._parser_internals.nonproj import DELIMITER
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"model": DEFAULT_PARSER_MODEL,
|
||||
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"dep_uas": 0.5,
|
||||
|
@ -63,7 +64,8 @@ def make_parser(
|
|||
moves: Optional[TransitionSystem],
|
||||
update_with_oracle_cut_size: int,
|
||||
learn_tokens: bool,
|
||||
min_action_freq: int
|
||||
min_action_freq: int,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based DependencyParser component. The dependency parser
|
||||
jointly learns sentence segmentation and labelled dependency parsing, and can
|
||||
|
@ -115,7 +117,8 @@ def make_parser(
|
|||
beam_update_prob=0.0,
|
||||
# At some point in the future we can try to implement support for
|
||||
# partial annotations, perhaps only in the beam objective.
|
||||
incorrect_spans_key=None
|
||||
incorrect_spans_key=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@Language.factory(
|
||||
|
@ -130,6 +133,7 @@ def make_parser(
|
|||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"model": DEFAULT_PARSER_MODEL,
|
||||
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"dep_uas": 0.5,
|
||||
|
@ -151,6 +155,7 @@ def make_beam_parser(
|
|||
beam_width: int,
|
||||
beam_density: float,
|
||||
beam_update_prob: float,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based DependencyParser component that uses beam-search.
|
||||
The dependency parser jointly learns sentence segmentation and labelled
|
||||
|
@ -207,10 +212,41 @@ def make_beam_parser(
|
|||
min_action_freq=min_action_freq,
|
||||
# At some point in the future we can try to implement support for
|
||||
# partial annotations, perhaps only in the beam objective.
|
||||
incorrect_spans_key=None
|
||||
incorrect_spans_key=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def parser_score(examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||
and Scorer.score_deps.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
def dep_getter(token, attr):
|
||||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.parser_scorer.v1")
|
||||
def make_parser_scorer():
|
||||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
|
@ -233,6 +269,7 @@ cdef class DependencyParser(Parser):
|
|||
beam_update_prob=0.0,
|
||||
multitasks=tuple(),
|
||||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
|
@ -249,6 +286,7 @@ cdef class DependencyParser(Parser):
|
|||
beam_update_prob=beam_update_prob,
|
||||
multitasks=multitasks,
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@property
|
||||
|
@ -281,31 +319,6 @@ cdef class DependencyParser(Parser):
|
|||
labels.add(label)
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
|
||||
and Scorer.score_deps.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "DependencyParser.score")
|
||||
def dep_getter(token, attr):
|
||||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
def scored_parses(self, beams):
|
||||
"""Return two dictionaries with scores for each beam/doc that was processed:
|
||||
one containing (i, head) keys, and another containing (i, label) keys.
|
||||
|
|
|
@ -16,7 +16,7 @@ from ..language import Language
|
|||
from ..vocab import Vocab
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
||||
|
@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"incl_context": True,
|
||||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -68,6 +69,7 @@ def make_entity_linker(
|
|||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -92,9 +94,19 @@ def make_entity_linker(
|
|||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def entity_linker_score(examples, **kwargs):
|
||||
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_linker_scorer.v1")
|
||||
def make_entity_linker_scorer():
|
||||
return entity_linker_score
|
||||
|
||||
|
||||
class EntityLinker(TrainablePipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
|
@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe):
|
|||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe):
|
|||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
|
@ -389,17 +403,6 @@ class EntityLinker(TrainablePipe):
|
|||
for token in ent:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores.
|
||||
|
||||
DOCS TODO: https://spacy.io/api/entity_linker#score
|
||||
"""
|
||||
validate_examples(examples, "EntityLinker.score")
|
||||
return Scorer.score_links(examples, negative_labels=[self.NIL])
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
|
|
|
@ -8,11 +8,10 @@ from .pipe import Pipe
|
|||
from ..training import Example
|
||||
from ..language import Language
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..scorer import get_ner_prf
|
||||
from ..training import validate_examples
|
||||
|
||||
|
||||
DEFAULT_ENT_ID_SEP = "||"
|
||||
|
@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
|||
"validate": False,
|
||||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
|
@ -42,6 +42,7 @@ def make_entity_ruler(
|
|||
validate: bool,
|
||||
overwrite_ents: bool,
|
||||
ent_id_sep: str,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return EntityRuler(
|
||||
nlp,
|
||||
|
@ -50,9 +51,19 @@ def make_entity_ruler(
|
|||
validate=validate,
|
||||
overwrite_ents=overwrite_ents,
|
||||
ent_id_sep=ent_id_sep,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def entity_ruler_score(examples, **kwargs):
|
||||
return get_ner_prf(examples)
|
||||
|
||||
|
||||
@registry.scorers("spacy.entity_ruler_scorer.v1")
|
||||
def make_entity_ruler_scorer():
|
||||
return entity_ruler_score
|
||||
|
||||
|
||||
class EntityRuler(Pipe):
|
||||
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
||||
rules or exact phrase matches. It can be combined with the statistical
|
||||
|
@ -74,6 +85,7 @@ class EntityRuler(Pipe):
|
|||
overwrite_ents: bool = False,
|
||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||
patterns: Optional[List[PatternType]] = None,
|
||||
scorer: Optional[Callable] = entity_ruler_score,
|
||||
) -> None:
|
||||
"""Initialize the entity ruler. If patterns are supplied here, they
|
||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||
|
@ -112,6 +124,7 @@ class EntityRuler(Pipe):
|
|||
self._ent_ids = defaultdict(dict)
|
||||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
self.scorer = scorer
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""The number of all patterns added to the entity ruler."""
|
||||
|
@ -358,10 +371,6 @@ class EntityRuler(Pipe):
|
|||
label = f"{label}{self.ent_id_sep}{ent_id}"
|
||||
return label
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
validate_examples(examples, "EntityRuler.score")
|
||||
return get_ner_prf(examples)
|
||||
|
||||
def from_bytes(
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
|
|
|
@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
|
|||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from ..training import validate_examples
|
||||
from ..util import logger, SimpleFrozenList
|
||||
from ..util import logger, SimpleFrozenList, registry
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "lookup", "overwrite": False},
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "lookup",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
|
||||
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.lemmatizer_scorer.v1")
|
||||
def make_lemmatizer_scorer():
|
||||
return lemmatizer_score
|
||||
|
||||
|
||||
class Lemmatizer(Pipe):
|
||||
|
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
|
|||
*,
|
||||
mode: str = "lookup",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
|
@ -89,6 +110,7 @@ class Lemmatizer(Pipe):
|
|||
raise ValueError(Errors.E1003.format(mode=mode))
|
||||
self.lemmatize = getattr(self, mode_attr)
|
||||
self.cache = {}
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def mode(self):
|
||||
|
@ -247,17 +269,6 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
return False
|
||||
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores.
|
||||
|
||||
DOCS: https://spacy.io/api/lemmatizer#score
|
||||
"""
|
||||
validate_examples(examples, "Lemmatizer.score")
|
||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional, Union, Dict
|
||||
from typing import Optional, Union, Dict, Callable
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from itertools import islice
|
||||
|
@ -17,6 +17,7 @@ from .tagger import Tagger
|
|||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL},
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
nlp: Language,
|
||||
model: Model,
|
||||
name: str,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name)
|
||||
return Morphologizer(nlp.vocab, model, name, scorer=scorer)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", getter=morph_key_getter, **kwargs))
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.morphologizer_scorer.v1")
|
||||
def make_morphologizer_scorer():
|
||||
return morphologizer_score
|
||||
|
||||
|
||||
class Morphologizer(Tagger):
|
||||
|
@ -67,6 +86,8 @@ class Morphologizer(Tagger):
|
|||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "morphologizer",
|
||||
*,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
||||
|
@ -87,6 +108,7 @@ class Morphologizer(Tagger):
|
|||
# 2) labels_pos stores a mapping from morph+POS->POS
|
||||
cfg = {"labels_morph": {}, "labels_pos": {}}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -246,24 +268,3 @@ class Morphologizer(Tagger):
|
|||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by
|
||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
|
||||
DOCS: https://spacy.io/api/morphologizer#score
|
||||
"""
|
||||
def morph_key_getter(token, attr):
|
||||
return getattr(token, attr).key
|
||||
|
||||
validate_examples(examples, "Morphologizer.score")
|
||||
results = {}
|
||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", getter=morph_key_getter, **kwargs))
|
||||
return results
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Iterable
|
||||
from typing import Optional, Iterable, Callable
|
||||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
|
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
|||
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf, PRFScore
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"moves": None,
|
||||
"update_with_oracle_cut_size": 100,
|
||||
"model": DEFAULT_NER_MODEL,
|
||||
"incorrect_spans_key": None
|
||||
"incorrect_spans_key": None,
|
||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
|
||||
|
@ -52,7 +53,8 @@ def make_ner(
|
|||
model: Model,
|
||||
moves: Optional[TransitionSystem],
|
||||
update_with_oracle_cut_size: int,
|
||||
incorrect_spans_key: Optional[str]=None
|
||||
incorrect_spans_key: Optional[str],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based EntityRecognizer component. The entity recognizer
|
||||
identifies non-overlapping labelled spans of tokens.
|
||||
|
@ -92,6 +94,7 @@ def make_ner(
|
|||
beam_width=1,
|
||||
beam_density=0.0,
|
||||
beam_update_prob=0.0,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
@Language.factory(
|
||||
|
@ -104,7 +107,8 @@ def make_ner(
|
|||
"beam_density": 0.01,
|
||||
"beam_update_prob": 0.5,
|
||||
"beam_width": 32,
|
||||
"incorrect_spans_key": None
|
||||
"incorrect_spans_key": None,
|
||||
"scorer": None,
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
)
|
||||
|
@ -117,7 +121,8 @@ def make_beam_ner(
|
|||
beam_width: int,
|
||||
beam_density: float,
|
||||
beam_update_prob: float,
|
||||
incorrect_spans_key: Optional[str]=None
|
||||
incorrect_spans_key: Optional[str],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Create a transition-based EntityRecognizer component that uses beam-search.
|
||||
The entity recognizer identifies non-overlapping labelled spans of tokens.
|
||||
|
@ -164,10 +169,20 @@ def make_beam_ner(
|
|||
beam_width=beam_width,
|
||||
beam_density=beam_density,
|
||||
beam_update_prob=beam_update_prob,
|
||||
incorrect_spans_key=incorrect_spans_key
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
def ner_score(examples, **kwargs):
|
||||
return get_ner_prf(examples, **kwargs)
|
||||
|
||||
|
||||
@registry.scorers("spacy.ner_scorer.v1")
|
||||
def make_ner_scorer():
|
||||
return ner_score
|
||||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
"""Pipeline component for named entity recognition.
|
||||
|
||||
|
@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser):
|
|||
beam_update_prob=0.0,
|
||||
multitasks=tuple(),
|
||||
incorrect_spans_key=None,
|
||||
scorer=ner_score,
|
||||
):
|
||||
"""Create an EntityRecognizer.
|
||||
"""
|
||||
|
@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser):
|
|||
beam_update_prob=beam_update_prob,
|
||||
multitasks=multitasks,
|
||||
incorrect_spans_key=incorrect_spans_key,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
def add_multitask_objective(self, mt_component):
|
||||
|
@ -227,17 +244,6 @@ cdef class EntityRecognizer(Parser):
|
|||
if move[0] in ("B", "I", "L", "U"))
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
|
||||
|
||||
DOCS: https://spacy.io/api/entityrecognizer#score
|
||||
"""
|
||||
validate_examples(examples, "EntityRecognizer.score")
|
||||
return get_ner_prf(examples)
|
||||
|
||||
def scored_ents(self, beams):
|
||||
"""Return a dictionary of (start, end, label) tuples with corresponding scores
|
||||
for each beam/doc that was processed.
|
||||
|
|
|
spacy/pipeline/pipe.pyx
@@ -81,6 +81,17 @@ cdef class Pipe:
 
         DOCS: https://spacy.io/api/pipe#score
         """
+        if hasattr(self, "scorer") and self.scorer is not None:
+            scorer_kwargs = {}
+            # use default settings from cfg (e.g., threshold)
+            if hasattr(self, "cfg") and isinstance(self.cfg, dict):
+                scorer_kwargs.update(self.cfg)
+            # override self.cfg["labels"] with self.labels
+            if hasattr(self, "labels"):
+                scorer_kwargs["labels"] = self.labels
+            # override with kwargs settings
+            scorer_kwargs.update(kwargs)
+            return self.scorer(examples, **scorer_kwargs)
         return {}
 
     @property
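The hunk above spells out the override order `Pipe.score` uses when assembling arguments for the registered scorer; a minimal illustration of that merge with plain dicts (values are assumed, not from the diff):

```python
# cfg defaults are applied first, then Pipe.labels, then caller kwargs win.
scorer_kwargs = {}
scorer_kwargs.update({"threshold": 0.5, "labels": ["A"]})  # from self.cfg
scorer_kwargs["labels"] = ("A", "B")                       # from self.labels
scorer_kwargs.update({"threshold": 0.7})                   # caller kwargs
assert scorer_kwargs == {"threshold": 0.7, "labels": ("A", "B")}
```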
|
|
@ -1,26 +1,29 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Callable
|
||||
import srsly
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from .pipe import Pipe
|
||||
from .senter import senter_score
|
||||
from ..language import Language
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
default_config={"punct_chars": None},
|
||||
default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_sentencizer(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
punct_chars: Optional[List[str]]
|
||||
punct_chars: Optional[List[str]],
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Sentencizer(name, punct_chars=punct_chars)
|
||||
return Sentencizer(name, punct_chars=punct_chars, scorer=scorer)
|
||||
|
||||
|
||||
class Sentencizer(Pipe):
|
||||
|
@ -41,7 +44,13 @@ class Sentencizer(Pipe):
|
|||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||
'。', '。']
|
||||
|
||||
def __init__(self, name="sentencizer", *, punct_chars=None):
|
||||
def __init__(
|
||||
self,
|
||||
name="sentencizer",
|
||||
*,
|
||||
punct_chars=None,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize the sentencizer.
|
||||
|
||||
punct_chars (list): Punctuation characters to split on. Will be
|
||||
|
@ -55,6 +64,7 @@ class Sentencizer(Pipe):
|
|||
self.punct_chars = set(punct_chars)
|
||||
else:
|
||||
self.punct_chars = set(self.default_punct_chars)
|
||||
self.scorer = scorer
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||
|
@ -122,22 +132,6 @@ class Sentencizer(Pipe):
|
|||
else:
|
||||
doc.c[j].sent_start = -1
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "Sentencizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the sentencizer to a bytestring.
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Optional, Callable
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||
|
@ -11,6 +12,7 @@ from ..language import Language
|
|||
from ..errors import Errors
|
||||
from ..scorer import Scorer
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"senter",
|
||||
assigns=["token.is_sent_start"],
|
||||
default_config={"model": DEFAULT_SENTER_MODEL},
|
||||
default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_senter(nlp: Language, name: str, model: Model):
|
||||
return SentenceRecognizer(nlp.vocab, model, name)
|
||||
def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
|
||||
return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer)
|
||||
|
||||
|
||||
def senter_score(examples, **kwargs):
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
|
||||
@registry.scorers("spacy.senter_scorer.v1")
|
||||
def make_senter_scorer():
|
||||
return senter_score
|
||||
|
||||
|
||||
class SentenceRecognizer(Tagger):
|
||||
|
@ -46,7 +62,7 @@ class SentenceRecognizer(Tagger):
|
|||
|
||||
DOCS: https://spacy.io/api/sentencerecognizer
|
||||
"""
|
||||
def __init__(self, vocab, model, name="senter"):
|
||||
def __init__(self, vocab, model, name="senter", *, scorer=senter_score):
|
||||
"""Initialize a sentence recognizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
|
@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger):
|
|||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
self.cfg = {}
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -153,18 +170,3 @@ class SentenceRecognizer(Tagger):
|
|||
|
||||
def add_label(self, label, values=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def score(self, examples, **kwargs):
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
|
||||
DOCS: https://spacy.io/api/sentencerecognizer#score
|
||||
"""
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
validate_examples(examples, "SentenceRecognizer.score")
|
||||
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
|
|

@@ -98,6 +98,7 @@ def build_ngram_range_suggester(
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)

@@ -107,8 +108,9 @@ def make_spancat(
suggester: Callable[[List[Doc]], Ragged],
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller

@@ -138,9 +140,28 @@ def make_spancat(
threshold=threshold,
max_positive=max_positive,
name=name,
scorer=scorer,
)


def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)


@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score


class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text.

@@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans",
threshold: float = 0.5,
max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None:
"""Initialize the span categorizer.

@@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester
self.model = model
self.name = name
self.scorer = scorer

@property
def key(self) -> str:

@@ -373,28 +396,6 @@ class SpanCategorizer(TrainablePipe):
else:
self.model.initialize()

def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.

examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("labels", self.labels)
kwargs.setdefault("multi_label", True)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)

def _validate_categories(self, examples):
# TODO
pass
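
With the scorer exposed as a config setting, the spancat scoring behaviour can be swapped without subclassing the component. A minimal sketch, assuming a made-up registry name custom.spancat_scorer.v1; only registry.scorers, the "scorer"/"@scorers" config keys, and the Scorer.score_spans defaults come from the diff above:

import spacy
from spacy.scorer import Scorer
from spacy.util import registry


@registry.scorers("custom.spancat_scorer.v1")
def make_custom_spancat_scorer():
    def custom_spancat_score(examples, **kwargs):
        # The component calls its scorer with the cfg (including spans_key)
        # as keyword arguments; forward them to Scorer.score_spans, but
        # score without allowing overlapping spans.
        kwargs = dict(kwargs)
        key = kwargs["spans_key"]
        kwargs.setdefault("attr", f"spans_{key}")
        kwargs.setdefault("allow_overlap", False)
        kwargs.setdefault(
            "getter", lambda doc, attr: doc.spans.get(attr[len("spans_"):], [])
        )
        kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
        return Scorer.score_spans(examples, **kwargs)

    return custom_spancat_score


nlp = spacy.blank("en")
nlp.add_pipe("spancat", config={"scorer": {"@scorers": "custom.spancat_scorer.v1"}})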

@@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config

@@ -18,6 +19,7 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util

@@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
"""Construct a part-of-speech tagger component.

model (Model[List[Doc], List[Floats2d]]): A model instance that predicts

@@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name)
return Tagger(nlp.vocab, model, name, scorer=scorer)


def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)


@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score


class Tagger(TrainablePipe):

@@ -60,7 +71,7 @@ class Tagger(TrainablePipe):

DOCS: https://spacy.io/api/tagger
"""
def __init__(self, vocab, model, name="tagger"):
def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score):
"""Initialize a part-of-speech tagger.

vocab (Vocab): The shared vocabulary.

@@ -76,6 +87,7 @@ class Tagger(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": []}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer

@property
def labels(self):

@@ -289,15 +301,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1

def score(self, examples, **kwargs):
"""Score a batch of examples.

examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".

DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
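
Because the tagger's scoring logic now lives in a registered function rather than a method, it can be looked up and called on its own. A small sketch with a blank pipeline and made-up gold tags; registry.scorers and the name spacy.tagger_scorer.v1 come from the diff above:

import spacy
from spacy.training import Example
from spacy.util import registry

nlp = spacy.blank("en")
doc = nlp.make_doc("I like cats")
# Gold tags live on the reference side; the predicted doc is left untagged here.
example = Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})

tagger_score = registry.scorers.get("spacy.tagger_scorer.v1")()
scores = tagger_score([example])
print(scores)  # expected to contain a "tag_acc" entry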

@@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab

@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,

@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered

@@ -96,7 +105,21 @@ def make_textcat(
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)


def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)


@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score


class TextCategorizer(TrainablePipe):

@@ -106,7 +129,13 @@ class TextCategorizer(TrainablePipe):
"""

def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None:
"""Initialize a text categorizer for single-label classification.

@@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self.scorer = scorer

@property
def labels(self) -> Tuple[str]:

@@ -354,26 +384,6 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)

def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.

examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)

def _validate_categories(self, examples: List[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
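
The registered textcat scorer no longer reads labels, threshold, or positive_label from the component; they arrive as keyword arguments, normally filled in from the component's cfg shown above. A minimal sketch of calling it directly with a hand-built Example and made-up labels:

import spacy
from spacy.pipeline.textcat import textcat_score
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("This movie was great")
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
# Fake "predictions" so there is something to score against the gold cats.
example.predicted.cats = {"POSITIVE": 0.9, "NEGATIVE": 0.1}

scores = textcat_score(
    [example],
    labels=["POSITIVE", "NEGATIVE"],
    positive_label="POSITIVE",
    threshold=0.5,
)
print(scores["cats_score"])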

@@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d

from ..language import Language
from ..training import Example, validate_examples, validate_get_examples
from ..training import Example, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
from .textcat import TextCategorizer

@@ -70,7 +71,11 @@ subword_features = true
@Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,

@@ -86,7 +91,11 @@ subword_features = true
},
)
def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered

@@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)


def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)


@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score


class MultiLabel_TextCategorizer(TextCategorizer):

@@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel",
*,
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None:
"""Initialize a text categorizer for multi-label classification.

@@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
self.scorer = scorer

def initialize(
self,

@@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)

def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.

examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.

DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)

def _validate_categories(self, examples: List[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""

@@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
cdef public object scorer

@@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None
incorrect_spans_key=None,
scorer=None,
):
"""Create a Parser.

@@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask)

self._rehearsal_model = None
self.scorer = scorer

def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments"""

@@ -537,7 +537,7 @@ class Scorer:

@staticmethod
def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str]
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,

@@ -711,7 +711,7 @@ class Scorer:
}


def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:
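
The added **cfg / **kwargs parameters are what allow these Scorer helpers to be used directly as registered scoring functions: components pass their whole cfg dict to the scorer, so unknown settings must be accepted and ignored rather than raising a TypeError. A tiny sketch of the effect, using an illustrative cfg key and an empty batch:

from spacy.scorer import get_ner_prf

# An extra keyword such as a component cfg entry is now simply ignored.
scores = get_ner_prf([], incorrect_spans_key=None)
print(scores)  # PRF keys with no counts, since the batch is empty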

@@ -32,24 +32,6 @@ def pattern_dicts():
]


@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]


@pytest.fixture
def tag_map():
return {

@@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")

# initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]

nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}

@@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set
assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")

# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}

return weird_scorer

ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456


def test_attributeruler_rule_order(nlp):
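
The test above wires its custom scorer through the misc registry; the scorers registry added in this commit can be used in exactly the same way, since the config block only needs to resolve to a callable. A sketch of the equivalent registration (the name weird_scorer.v2 is made up):

from spacy.util import registry


@registry.scorers("weird_scorer.v2")
def make_weird_scorer_v2():
    def weird_scorer(examples, weird_score, **kwargs):
        return {"weird_score": weird_score}

    return weird_scorer

# Selected in the component config via the @scorers key, e.g.:
# nlp.add_pipe("attribute_ruler", config={"scorer": {"@scorers": "weird_scorer.v2"}})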

@@ -95,6 +95,7 @@ class registry(thinc.registry):
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the