Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Refactor scoring methods to use registered functions (#8766)
* Add scorer option to components

  Add an optional `scorer` parameter to all pipeline components. If a scoring
  function is provided, it overrides the default scoring method for that
  component.

* Add registered scorers for all components
* Add `scorers` registry
* Move all scoring methods out of the components as independent, registered
  functions
* Use the registered scoring methods as defaults in configs and inits

  Additional changes:

  * The scoring methods no longer have access to the full component, so
    settings from `cfg` are used as default scorer options to handle settings
    such as `labels`, `threshold`, and `positive_label`
  * The `attribute_ruler` scoring method no longer has access to the patterns,
    so all scoring methods are called
  * Bug fix: the `spancat` scoring method now sets `allow_overlap` so that
    overlapping spans are scored correctly

* Update the Russian lemmatizer to use the direct score method
* Check the type of `cfg` in `Pipe.score`
* Fix check
* Update spacy/pipeline/sentencizer.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove `validate_examples` from scoring functions
* Use `Pipe.labels` instead of `Pipe.cfg["labels"]`

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
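To illustrate the mechanism this commit introduces, here is a minimal sketch of registering a custom scoring function in the new `scorers` registry and pointing a component's `scorer` setting at it. It assumes a spaCy v3.x install that includes this commit; the registry name `custom_lemma_scorer.v1` and the delegation to `Scorer.score_token_attr` are illustrative choices, not part of the commit itself.

    # Minimal sketch (assumes spaCy v3.x with this commit applied).
    # The registry name "custom_lemma_scorer.v1" is hypothetical.
    from typing import Any, Dict, Iterable

    import spacy
    from spacy.scorer import Scorer
    from spacy.training import Example
    from spacy.util import registry


    @registry.scorers("custom_lemma_scorer.v1")
    def make_custom_lemma_scorer():
        def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
            # Delegate to the built-in token-attribute scorer for "lemma".
            return Scorer.score_token_attr(examples, "lemma", **kwargs)

        return score


    nlp = spacy.blank("en")
    # Override the component's default scorer through its config.
    nlp.add_pipe(
        "lemmatizer",
        config={"scorer": {"@scorers": "custom_lemma_scorer.v1"}},
    )

Components constructed without such an override keep their registered default, e.g. `spacy.lemmatizer_scorer.v1` for the lemmatizer, as seen in the diffs below.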
parent ee011ca963
commit f99d6d5e39
spacy/lang/bn/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Bengali"]
spacy/lang/ca/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -28,13 +28,25 @@ class Catalan(Language):
 @Catalan.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return CatalanLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Catalan"]
spacy/lang/el/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return GreekLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Greek"]
spacy/lang/en/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return EnglishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["English"]
spacy/lang/es/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
 @Spanish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return SpanishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Spanish"]
spacy/lang/fa/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Persian"]
spacy/lang/fr/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -31,13 +31,25 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return FrenchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["French"]
spacy/lang/it/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -23,13 +23,25 @@ class Italian(Language):
 @Italian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return ItalianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Italian"]
spacy/lang/mk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .lemmatizer import MacedonianLemmatizer
 from .stop_words import STOP_WORDS
@@ -38,13 +38,25 @@ class Macedonian(Language):
 @Macedonian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return MacedonianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Macedonian"]
spacy/lang/nb/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -26,13 +26,25 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Norwegian"]
spacy/lang/nl/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -30,13 +30,25 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return DutchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Dutch"]
spacy/lang/pl/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -33,13 +33,25 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pos_lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return PolishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Polish"]
spacy/lang/ru/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -22,7 +22,12 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -31,8 +36,11 @@ def make_lemmatizer(
     name: str,
     mode: str,
     overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return RussianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Russian"]
spacy/lang/ru/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional, List, Dict, Tuple
+from typing import Optional, List, Dict, Tuple, Callable
 
 from thinc.api import Model
 
 from ...pipeline import Lemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...symbols import POS
 from ...tokens import Token
 from ...vocab import Vocab
@@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
                 ) from None
         if getattr(self, "_morph", None) is None:
             self._morph = MorphAnalyzer()
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
         string = token.text
spacy/lang/sv/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -29,13 +29,25 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Swedish"]
spacy/lang/uk/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -23,13 +23,25 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "pymorphy2",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return UkrainianLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Ukrainian"]
spacy/lang/uk/lemmatizer.py
@@ -1,8 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
+from ...pipeline.lemmatizer import lemmatizer_score
 from ...vocab import Vocab
 
 
@@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         *,
         mode: str = "pymorphy2",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         if mode == "pymorphy2":
             try:
@@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
         if getattr(self, "_morph", None) is None:
             self._morph = MorphAnalyzer(lang="uk")
-        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
spacy/pipeline/attributeruler.py
@@ -5,15 +5,15 @@ from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
-from ..training import validate_examples, Example
+from ..training import Example
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer
-from ..symbols import IDS, TAG, POS, MORPH, LEMMA
+from ..symbols import IDS
 from ..tokens import Doc, Span
 from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
 from ..vocab import Vocab
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
 from .. import util
 
 
@@ -23,9 +23,43 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
 MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
 
 
-@Language.factory("attribute_ruler", default_config={"validate": False})
-def make_attribute_ruler(nlp: Language, name: str, validate: bool):
-    return AttributeRuler(nlp.vocab, name, validate=validate)
+@Language.factory(
+    "attribute_ruler",
+    default_config={
+        "validate": False,
+        "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
+    },
+)
+def make_attribute_ruler(
+    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
+):
+    return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
+
+
+def attribute_ruler_score(
+    examples: Iterable[Example], **kwargs
+) -> Dict[str, Any]:
+    def morph_key_getter(token, attr):
+        return getattr(token, attr).key
+
+    results = {}
+    results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
+    results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+    results.update(
+        Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
+    )
+    results.update(
+        Scorer.score_token_attr_per_feat(
+            examples, "morph", getter=morph_key_getter, **kwargs
+        )
+    )
+    results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
+    return results
+
+
+@registry.scorers("spacy.attribute_ruler_scorer.v1")
+def make_attribute_ruler_scorer():
+    return attribute_ruler_score
 
 
 class AttributeRuler(Pipe):
@@ -36,7 +70,12 @@ class AttributeRuler(Pipe):
     """
 
     def __init__(
-        self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
+        self,
+        vocab: Vocab,
+        name: str = "attribute_ruler",
+        *,
+        validate: bool = False,
+        scorer: Optional[Callable] = attribute_ruler_score,
     ) -> None:
         """Create the AttributeRuler. After creation, you can add patterns
         with the `.initialize()` or `.add_patterns()` methods, or load patterns
@@ -57,6 +96,7 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
+        self.scorer = scorer
 
     def clear(self) -> None:
         """Reset all patterns."""
@@ -228,45 +268,6 @@ class AttributeRuler(Pipe):
             all_patterns.append(p)
         return all_patterns
 
-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "tag", "pos", "morph"
-            and "lemma" for the target token attributes.
-
-        DOCS: https://spacy.io/api/tagger#score
-        """
-
-        def morph_key_getter(token, attr):
-            return getattr(token, attr).key
-
-        validate_examples(examples, "AttributeRuler.score")
-        results = {}
-        attrs = set()
-        for token_attrs in self.attrs:
-            attrs.update(token_attrs)
-        for attr in attrs:
-            if attr == TAG:
-                results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
-            elif attr == POS:
-                results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-            elif attr == MORPH:
-                results.update(
-                    Scorer.score_token_attr(
-                        examples, "morph", getter=morph_key_getter, **kwargs
-                    )
-                )
-                results.update(
-                    Scorer.score_token_attr_per_feat(
-                        examples, "morph", getter=morph_key_getter, **kwargs
-                    )
-                )
-            elif attr == LEMMA:
-                results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
-        return results
-
     def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
         """Serialize the AttributeRuler to a bytestring.
 
spacy/pipeline/dep_parser.pyx
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config
 
 from ._parser_internals.transition_system import TransitionSystem
@@ -12,7 +12,7 @@ from ..language import Language
 from ._parser_internals import nonproj
 from ._parser_internals.nonproj import DELIMITER
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..util import registry
 
 
 default_model_config = """
@@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
+        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
     },
     default_score_weights={
         "dep_uas": 0.5,
@@ -63,7 +64,8 @@ def make_parser(
     moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
-    min_action_freq: int
+    min_action_freq: int,
+    scorer: Optional[Callable],
 ):
     """Create a transition-based DependencyParser component. The dependency parser
     jointly learns sentence segmentation and labelled dependency parsing, and can
@@ -115,7 +117,8 @@ def make_parser(
         beam_update_prob=0.0,
         # At some point in the future we can try to implement support for
        # partial annotations, perhaps only in the beam objective.
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=scorer,
     )
 
 @Language.factory(
@@ -130,6 +133,7 @@ def make_parser(
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
+        "scorer": {"@scorers": "spacy.parser_scorer.v1"},
     },
     default_score_weights={
         "dep_uas": 0.5,
@@ -151,6 +155,7 @@ def make_beam_parser(
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
+    scorer: Optional[Callable],
 ):
     """Create a transition-based DependencyParser component that uses beam-search.
     The dependency parser jointly learns sentence segmentation and labelled
@@ -207,10 +212,41 @@ def make_beam_parser(
         min_action_freq=min_action_freq,
         # At some point in the future we can try to implement support for
         # partial annotations, perhaps only in the beam objective.
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=scorer,
     )
 
 
+def parser_score(examples, **kwargs):
+    """Score a batch of examples.
+
+    examples (Iterable[Example]): The examples to score.
+    RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
+        and Scorer.score_deps.
+
+    DOCS: https://spacy.io/api/dependencyparser#score
+    """
+    def has_sents(doc):
+        return doc.has_annotation("SENT_START")
+
+    def dep_getter(token, attr):
+        dep = getattr(token, attr)
+        dep = token.vocab.strings.as_string(dep).lower()
+        return dep
+    results = {}
+    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    kwargs.setdefault("getter", dep_getter)
+    kwargs.setdefault("ignore_labels", ("p", "punct"))
+    results.update(Scorer.score_deps(examples, "dep", **kwargs))
+    del results["sents_per_type"]
+    return results
+
+
+@registry.scorers("spacy.parser_scorer.v1")
+def make_parser_scorer():
+    return parser_score
+
+
 cdef class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
@@ -233,6 +269,7 @@ cdef class DependencyParser(Parser):
         beam_update_prob=0.0,
        multitasks=tuple(),
         incorrect_spans_key=None,
+        scorer=parser_score,
     ):
         """Create a DependencyParser.
         """
@@ -249,6 +286,7 @@ cdef class DependencyParser(Parser):
             beam_update_prob=beam_update_prob,
             multitasks=multitasks,
             incorrect_spans_key=incorrect_spans_key,
+            scorer=scorer,
         )
 
     @property
@@ -281,31 +319,6 @@ cdef class DependencyParser(Parser):
                 labels.add(label)
         return tuple(sorted(labels))
 
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
-            and Scorer.score_deps.
-
-        DOCS: https://spacy.io/api/dependencyparser#score
-        """
-        def has_sents(doc):
-            return doc.has_annotation("SENT_START")
-
-        validate_examples(examples, "DependencyParser.score")
-        def dep_getter(token, attr):
-            dep = getattr(token, attr)
-            dep = token.vocab.strings.as_string(dep).lower()
-            return dep
-        results = {}
-        results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
-        kwargs.setdefault("getter", dep_getter)
-        kwargs.setdefault("ignore_labels", ("p", "punct"))
-        results.update(Scorer.score_deps(examples, "dep", **kwargs))
-        del results["sents_per_type"]
-        return results
-
     def scored_parses(self, beams):
         """Return two dictionaries with scores for each beam/doc that was processed:
         one containing (i, head) keys, and another containing (i, label) keys.
spacy/pipeline/entity_linker.py
@@ -16,7 +16,7 @@ from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
-from ..util import SimpleFrozenList
+from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer
 
@@ -50,6 +50,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
+        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
     },
     default_score_weights={
         "nel_micro_f": 1.0,
@@ -68,6 +69,7 @@ def make_entity_linker(
     incl_context: bool,
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+    scorer: Optional[Callable],
 ):
     """Construct an EntityLinker component.
 
@@ -92,9 +94,19 @@ def make_entity_linker(
         incl_context=incl_context,
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
+        scorer=scorer,
     )
 
 
+def entity_linker_score(examples, **kwargs):
+    return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
+
+
+@registry.scorers("spacy.entity_linker_scorer.v1")
+def make_entity_linker_scorer():
+    return entity_linker_score
+
+
 class EntityLinker(TrainablePipe):
     """Pipeline component for named entity linking.
 
@@ -115,6 +127,7 @@ class EntityLinker(TrainablePipe):
         incl_context: bool,
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+        scorer: Optional[Callable] = entity_linker_score,
     ) -> None:
         """Initialize an entity linker.
 
@@ -145,6 +158,7 @@ class EntityLinker(TrainablePipe):
         # how many neighbour sentences to take into account
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.scorer = scorer
 
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
@@ -389,17 +403,6 @@ class EntityLinker(TrainablePipe):
             for token in ent:
                 token.ent_kb_id_ = kb_id
 
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores.
-
-        DOCS TODO: https://spacy.io/api/entity_linker#score
-        """
-        validate_examples(examples, "EntityLinker.score")
-        return Scorer.score_links(examples, negative_labels=[self.NIL])
-
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.
 
spacy/pipeline/entityruler.py
@@ -8,11 +8,10 @@ from .pipe import Pipe
 from ..training import Example
 from ..language import Language
 from ..errors import Errors, Warnings
-from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
+from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import get_ner_prf
-from ..training import validate_examples
 
 
 DEFAULT_ENT_ID_SEP = "||"
@@ -27,6 +26,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
         "validate": False,
         "overwrite_ents": False,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
+        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
     },
     default_score_weights={
         "ents_f": 1.0,
@@ -42,6 +42,7 @@ def make_entity_ruler(
     validate: bool,
     overwrite_ents: bool,
     ent_id_sep: str,
+    scorer: Optional[Callable],
 ):
     return EntityRuler(
         nlp,
@@ -50,9 +51,19 @@ def make_entity_ruler(
         validate=validate,
         overwrite_ents=overwrite_ents,
         ent_id_sep=ent_id_sep,
+        scorer=scorer,
     )
 
 
+def entity_ruler_score(examples, **kwargs):
+    return get_ner_prf(examples)
+
+
+@registry.scorers("spacy.entity_ruler_scorer.v1")
+def make_entity_ruler_scorer():
+    return entity_ruler_score
+
+
 class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
@@ -74,6 +85,7 @@ class EntityRuler(Pipe):
         overwrite_ents: bool = False,
         ent_id_sep: str = DEFAULT_ENT_ID_SEP,
         patterns: Optional[List[PatternType]] = None,
+        scorer: Optional[Callable] = entity_ruler_score,
     ) -> None:
         """Initialize the entity ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -112,6 +124,7 @@ class EntityRuler(Pipe):
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
             self.add_patterns(patterns)
+        self.scorer = scorer
 
     def __len__(self) -> int:
         """The number of all patterns added to the entity ruler."""
@@ -358,10 +371,6 @@ class EntityRuler(Pipe):
             label = f"{label}{self.ent_id_sep}{ent_id}"
         return label
 
-    def score(self, examples, **kwargs):
-        validate_examples(examples, "EntityRuler.score")
-        return get_ner_prf(examples)
-
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":
spacy/pipeline/lemmatizer.py
@@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
-from ..training import validate_examples
-from ..util import logger, SimpleFrozenList
+from ..util import logger, SimpleFrozenList, registry
 from .. import util
 
 
 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "lookup", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "lookup",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
+
+
+def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+    return Scorer.score_token_attr(examples, "lemma", **kwargs)
+
+
+@registry.scorers("spacy.lemmatizer_scorer.v1")
+def make_lemmatizer_scorer():
+    return lemmatizer_score
 
 
 class Lemmatizer(Pipe):
@@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
         *,
         mode: str = "lookup",
         overwrite: bool = False,
+        scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
         """Initialize a Lemmatizer.
 
@@ -89,6 +110,7 @@ class Lemmatizer(Pipe):
             raise ValueError(Errors.E1003.format(mode=mode))
         self.lemmatize = getattr(self, mode_attr)
         self.cache = {}
+        self.scorer = scorer
 
     @property
     def mode(self):
@@ -247,17 +269,6 @@ class Lemmatizer(Pipe):
         """
         return False
 
-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores.
-
-        DOCS: https://spacy.io/api/lemmatizer#score
-        """
-        validate_examples(examples, "Lemmatizer.score")
-        return Scorer.score_token_attr(examples, "lemma", **kwargs)
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ):
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, Union, Dict
+from typing import Optional, Union, Dict, Callable
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -17,6 +17,7 @@ from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
+from ..util import registry


 default_model_config = """
@@ -48,15 +49,33 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL},
+    default_config={"model": DEFAULT_MORPH_MODEL, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
     nlp: Language,
     model: Model,
     name: str,
+    scorer: Optional[Callable],
 ):
-    return Morphologizer(nlp.vocab, model, name)
+    return Morphologizer(nlp.vocab, model, name, scorer=scorer)
+
+
+def morphologizer_score(examples, **kwargs):
+    def morph_key_getter(token, attr):
+        return getattr(token, attr).key
+
+    results = {}
+    results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
+    results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
+    results.update(Scorer.score_token_attr_per_feat(examples,
+        "morph", getter=morph_key_getter, **kwargs))
+    return results
+
+
+@registry.scorers("spacy.morphologizer_scorer.v1")
+def make_morphologizer_scorer():
+    return morphologizer_score


 class Morphologizer(Tagger):
@@ -67,6 +86,8 @@ class Morphologizer(Tagger):
         vocab: Vocab,
         model: Model,
         name: str = "morphologizer",
+        *,
+        scorer: Optional[Callable] = morphologizer_score,
     ):
         """Initialize a morphologizer.

@@ -87,6 +108,7 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": {}, "labels_pos": {}}
         self.cfg = dict(sorted(cfg.items()))
+        self.scorer = scorer

     @property
     def labels(self):
@@ -246,24 +268,3 @@ class Morphologizer(Tagger):
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
-
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "pos" and "morph" and
-            Scorer.score_token_attr_per_feat for the attribute "morph".
-
-        DOCS: https://spacy.io/api/morphologizer#score
-        """
-        def morph_key_getter(token, attr):
-            return getattr(token, attr).key
-
-        validate_examples(examples, "Morphologizer.score")
-        results = {}
-        results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
-        results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
-        results.update(Scorer.score_token_attr_per_feat(examples,
-            "morph", getter=morph_key_getter, **kwargs))
-        return results
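The registered default above is also the hook for swapping in a custom scorer: anything registered under @scorers can be referenced from the component's "scorer" slot. A minimal sketch, assuming a build that includes this commit; the scorer name is hypothetical:

import spacy
from spacy.scorer import Scorer
from spacy.util import registry


@registry.scorers("custom_pos_scorer.v1")  # hypothetical name
def make_custom_pos_scorer():
    def pos_score(examples, **kwargs):
        # score coarse-grained POS only, skipping the per-feature morph breakdown
        return Scorer.score_token_attr(examples, "pos", **kwargs)

    return pos_score


nlp = spacy.blank("en")
nlp.add_pipe("morphologizer", config={"scorer": {"@scorers": "custom_pos_scorer.v1"}})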
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from collections import defaultdict
-from typing import Optional, Iterable
+from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config

 from ._parser_internals.transition_system import TransitionSystem
@@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown

 from ..language import Language
 from ..scorer import get_ner_prf, PRFScore
-from ..training import validate_examples
+from ..util import registry


 default_model_config = """
@@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "moves": None,
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
-        "incorrect_spans_key": None
+        "incorrect_spans_key": None,
+        "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},

@@ -52,7 +53,8 @@ def make_ner(
     model: Model,
     moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
-    incorrect_spans_key: Optional[str]=None
+    incorrect_spans_key: Optional[str],
+    scorer: Optional[Callable],
 ):
     """Create a transition-based EntityRecognizer component. The entity recognizer
     identifies non-overlapping labelled spans of tokens.
@@ -92,6 +94,7 @@ def make_ner(
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
+        scorer=scorer,
     )

 @Language.factory(
@@ -104,7 +107,8 @@ def make_ner(
         "beam_density": 0.01,
         "beam_update_prob": 0.5,
         "beam_width": 32,
-        "incorrect_spans_key": None
+        "incorrect_spans_key": None,
+        "scorer": None,
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )
@@ -117,7 +121,8 @@ def make_beam_ner(
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
-    incorrect_spans_key: Optional[str]=None
+    incorrect_spans_key: Optional[str],
+    scorer: Optional[Callable],
 ):
     """Create a transition-based EntityRecognizer component that uses beam-search.
     The entity recognizer identifies non-overlapping labelled spans of tokens.
@@ -164,10 +169,20 @@ def make_beam_ner(
         beam_width=beam_width,
         beam_density=beam_density,
         beam_update_prob=beam_update_prob,
-        incorrect_spans_key=incorrect_spans_key
+        incorrect_spans_key=incorrect_spans_key,
+        scorer=scorer,
     )


+def ner_score(examples, **kwargs):
+    return get_ner_prf(examples, **kwargs)
+
+
+@registry.scorers("spacy.ner_scorer.v1")
+def make_ner_scorer():
+    return ner_score
+
+
 cdef class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.

@@ -188,6 +203,7 @@ cdef class EntityRecognizer(Parser):
         beam_update_prob=0.0,
         multitasks=tuple(),
         incorrect_spans_key=None,
+        scorer=ner_score,
     ):
         """Create an EntityRecognizer.
         """
@@ -204,6 +220,7 @@ cdef class EntityRecognizer(Parser):
             beam_update_prob=beam_update_prob,
             multitasks=multitasks,
             incorrect_spans_key=incorrect_spans_key,
+            scorer=scorer,
         )

     def add_multitask_objective(self, mt_component):
@@ -227,17 +244,6 @@ cdef class EntityRecognizer(Parser):
                 if move[0] in ("B", "I", "L", "U"))
         return tuple(sorted(labels))

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
-
-        DOCS: https://spacy.io/api/entityrecognizer#score
-        """
-        validate_examples(examples, "EntityRecognizer.score")
-        return get_ner_prf(examples)
-
    def scored_ents(self, beams):
        """Return a dictionary of (start, end, label) tuples with corresponding scores
        for each beam/doc that was processed.
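Note that get_ner_prf itself (see the spacy/scorer.py hunks further down) gains **kwargs so the registered scorer can absorb component cfg settings it does not use. A quick sketch of that tolerance, assuming a build that includes this commit:

from spacy.scorer import get_ner_prf

# extra cfg-style settings are accepted and ignored
scores = get_ner_prf([], incorrect_spans_key=None)
print(scores)  # ents_p / ents_r / ents_f are None for an empty batch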
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -81,6 +81,17 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#score
         """
+        if hasattr(self, "scorer") and self.scorer is not None:
+            scorer_kwargs = {}
+            # use default settings from cfg (e.g., threshold)
+            if hasattr(self, "cfg") and isinstance(self.cfg, dict):
+                scorer_kwargs.update(self.cfg)
+            # override self.cfg["labels"] with self.labels
+            if hasattr(self, "labels"):
+                scorer_kwargs["labels"] = self.labels
+            # override with kwargs settings
+            scorer_kwargs.update(kwargs)
+            return self.scorer(examples, **scorer_kwargs)
         return {}

     @property
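The ordering in Pipe.score above is the whole contract: the component cfg supplies defaults, the live labels property overrides cfg["labels"], and caller kwargs override both. A standalone sketch of that resolution order in plain Python (names hypothetical):

def resolve_scorer_kwargs(cfg, labels, kwargs):
    scorer_kwargs = {}
    scorer_kwargs.update(cfg)          # defaults stored on the component
    scorer_kwargs["labels"] = labels   # live labels beat cfg["labels"]
    scorer_kwargs.update(kwargs)       # explicit caller settings win
    return scorer_kwargs


assert resolve_scorer_kwargs(
    {"threshold": 0.5, "labels": []}, ("A", "B"), {"threshold": 0.7}
) == {"threshold": 0.7, "labels": ("A", "B")}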
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,26 +1,29 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, List
+from typing import Optional, List, Callable
 import srsly

 from ..tokens.doc cimport Doc

 from .pipe import Pipe
+from .senter import senter_score
 from ..language import Language
 from ..scorer import Scorer
-from ..training import validate_examples
 from .. import util


 @Language.factory(
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
-    default_config={"punct_chars": None},
+    default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_sentencizer(
     nlp: Language,
     name: str,
-    punct_chars: Optional[List[str]]
+    punct_chars: Optional[List[str]],
+    scorer: Optional[Callable],
 ):
-    return Sentencizer(name, punct_chars=punct_chars)
+    return Sentencizer(name, punct_chars=punct_chars, scorer=scorer)


 class Sentencizer(Pipe):
@@ -41,7 +44,13 @@ class Sentencizer(Pipe):
             '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
             '。', '。']

-    def __init__(self, name="sentencizer", *, punct_chars=None):
+    def __init__(
+        self,
+        name="sentencizer",
+        *,
+        punct_chars=None,
+        scorer=senter_score,
+    ):
         """Initialize the sentencizer.

         punct_chars (list): Punctuation characters to split on. Will be
@@ -55,6 +64,7 @@ class Sentencizer(Pipe):
             self.punct_chars = set(punct_chars)
         else:
             self.punct_chars = set(self.default_punct_chars)
+        self.scorer = scorer

     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -122,22 +132,6 @@ class Sentencizer(Pipe):
             else:
                 doc.c[j].sent_start = -1

-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
-
-        DOCS: https://spacy.io/api/sentencizer#score
-        """
-        def has_sents(doc):
-            return doc.has_annotation("SENT_START")
-
-        validate_examples(examples, "Sentencizer.score")
-        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-        del results["sents_per_type"]
-        return results
-
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the sentencizer to a bytestring.
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from itertools import islice
+from typing import Optional, Callable

 import srsly
 from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@@ -11,6 +12,7 @@ from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
+from ..util import registry
 from .. import util


@@ -34,11 +36,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "senter",
     assigns=["token.is_sent_start"],
-    default_config={"model": DEFAULT_SENTER_MODEL},
+    default_config={"model": DEFAULT_SENTER_MODEL, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
-def make_senter(nlp: Language, name: str, model: Model):
-    return SentenceRecognizer(nlp.vocab, model, name)
+def make_senter(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
+    return SentenceRecognizer(nlp.vocab, model, name, scorer=scorer)
+
+
+def senter_score(examples, **kwargs):
+    def has_sents(doc):
+        return doc.has_annotation("SENT_START")
+
+    results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    del results["sents_per_type"]
+    return results
+
+
+@registry.scorers("spacy.senter_scorer.v1")
+def make_senter_scorer():
+    return senter_score


 class SentenceRecognizer(Tagger):
@@ -46,7 +62,7 @@ class SentenceRecognizer(Tagger):

     DOCS: https://spacy.io/api/sentencerecognizer
     """
-    def __init__(self, vocab, model, name="senter"):
+    def __init__(self, vocab, model, name="senter", *, scorer=senter_score):
         """Initialize a sentence recognizer.

         vocab (Vocab): The shared vocabulary.
@@ -61,6 +77,7 @@ class SentenceRecognizer(Tagger):
         self.name = name
         self._rehearsal_model = None
         self.cfg = {}
+        self.scorer = scorer

     @property
     def labels(self):
@@ -153,18 +170,3 @@ class SentenceRecognizer(Tagger):

     def add_label(self, label, values=None):
         raise NotImplementedError
-
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
-        DOCS: https://spacy.io/api/sentencerecognizer#score
-        """
-        def has_sents(doc):
-            return doc.has_annotation("SENT_START")
-
-        validate_examples(examples, "SentenceRecognizer.score")
-        results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
-        del results["sents_per_type"]
-        return results
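Since senter_score is now a plain function (and the sentencizer above reuses it as its default), it can be exercised outside a pipeline. A small sketch, assuming a build that includes this commit; the import path is taken from this diff:

import spacy
from spacy.pipeline.senter import senter_score
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("One. Two.")
doc[2].is_sent_start = True  # stand-in for a predicted boundary at "Two"
example = Example.from_dict(doc, {"sent_starts": [1, 0, 1, 0]})
print(senter_score([example]))  # sents_p / sents_r / sents_f, per-type entry removed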
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -98,6 +98,7 @@ def build_ngram_range_suggester(
         "max_positive": None,
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -107,8 +108,9 @@ def make_spancat(
     suggester: Callable[[List[Doc]], Ragged],
     model: Model[Tuple[List[Doc], Ragged], Floats2d],
     spans_key: str,
-    threshold: float = 0.5,
-    max_positive: Optional[int] = None,
+    scorer: Optional[Callable],
+    threshold: float,
+    max_positive: Optional[int],
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component. The span categorizer consists of two
     parts: a suggester function that proposes candidate spans, and a labeller
@@ -138,9 +140,28 @@ def make_spancat(
         threshold=threshold,
         max_positive=max_positive,
         name=name,
+        scorer=scorer,
     )


+def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+    kwargs = dict(kwargs)
+    attr_prefix = "spans_"
+    key = kwargs["spans_key"]
+    kwargs.setdefault("attr", f"{attr_prefix}{key}")
+    kwargs.setdefault("allow_overlap", True)
+    kwargs.setdefault(
+        "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
+    )
+    kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
+    return Scorer.score_spans(examples, **kwargs)
+
+
+@registry.scorers("spacy.spancat_scorer.v1")
+def make_spancat_scorer():
+    return spancat_score
+
+
 class SpanCategorizer(TrainablePipe):
     """Pipeline component to label spans of text.

@@ -157,6 +178,7 @@ class SpanCategorizer(TrainablePipe):
         spans_key: str = "spans",
         threshold: float = 0.5,
         max_positive: Optional[int] = None,
+        scorer: Optional[Callable] = spancat_score,
     ) -> None:
         """Initialize the span categorizer.

@@ -172,6 +194,7 @@ class SpanCategorizer(TrainablePipe):
         self.suggester = suggester
         self.model = model
         self.name = name
+        self.scorer = scorer

     @property
     def key(self) -> str:
@@ -373,28 +396,6 @@ class SpanCategorizer(TrainablePipe):
         else:
             self.model.initialize()

-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
-        DOCS: https://spacy.io/api/spancategorizer#score
-        """
-        validate_examples(examples, "SpanCategorizer.score")
-        self._validate_categories(examples)
-        kwargs = dict(kwargs)
-        attr_prefix = "spans_"
-        kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
-        kwargs.setdefault("labels", self.labels)
-        kwargs.setdefault("multi_label", True)
-        kwargs.setdefault("threshold", self.cfg["threshold"])
-        kwargs.setdefault(
-            "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
-        )
-        kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
-        return Scorer.score_spans(examples, **kwargs)
-
     def _validate_categories(self, examples):
         # TODO
         pass
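Two details in spancat_score are worth flagging: the score keys are derived from the spans_key that arrives via the component cfg, and allow_overlap now defaults to True, which is the overlapping-spans scoring fix noted in the commit message. The key derivation, as a trivial sketch:

attr_prefix = "spans_"
spans_key = "sc"                              # SpanCategorizer's default key
attr = f"{attr_prefix}{spans_key}"
assert attr == "spans_sc"                     # matches spans_sc_f / spans_sc_p / spans_sc_r
assert attr[len(attr_prefix):] == spans_key   # the getter strips the prefix back off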
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import Callable, Optional
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@@ -18,6 +19,7 @@ from ..parts_of_speech import X
 from ..errors import Errors, Warnings
 from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
+from ..util import registry
 from .. import util


@@ -41,10 +43,10 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "tagger",
     assigns=["token.tag"],
-    default_config={"model": DEFAULT_TAGGER_MODEL},
+    default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
     default_score_weights={"tag_acc": 1.0},
 )
-def make_tagger(nlp: Language, name: str, model: Model):
+def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
     """Construct a part-of-speech tagger component.

     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@@ -52,7 +54,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
         in size, and be normalized as probabilities (all scores between 0 and 1,
         with the rows summing to 1).
     """
-    return Tagger(nlp.vocab, model, name)
+    return Tagger(nlp.vocab, model, name, scorer=scorer)
+
+
+def tagger_score(examples, **kwargs):
+    return Scorer.score_token_attr(examples, "tag", **kwargs)
+
+
+@registry.scorers("spacy.tagger_scorer.v1")
+def make_tagger_scorer():
+    return tagger_score


 class Tagger(TrainablePipe):
@@ -60,7 +71,7 @@ class Tagger(TrainablePipe):

     DOCS: https://spacy.io/api/tagger
     """
-    def __init__(self, vocab, model, name="tagger"):
+    def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score):
         """Initialize a part-of-speech tagger.

         vocab (Vocab): The shared vocabulary.
@@ -76,6 +87,7 @@ class Tagger(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": []}
         self.cfg = dict(sorted(cfg.items()))
+        self.scorer = scorer

     @property
     def labels(self):
@@ -289,15 +301,3 @@ class Tagger(TrainablePipe):
         self.cfg["labels"].append(label)
         self.vocab.strings.add(label)
         return 1
-
-    def score(self, examples, **kwargs):
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by
-            Scorer.score_token_attr for the attributes "tag".
-
-        DOCS: https://spacy.io/api/tagger#score
-        """
-        validate_examples(examples, "Tagger.score")
-        return Scorer.score_token_attr(examples, "tag", **kwargs)
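A {"@scorers": "..."} block in a default_config resolves through the new scorers registry when the config is loaded; the same lookup can be done by hand. Sketch, assuming a build that includes this commit:

from spacy.util import registry

make_scorer = registry.scorers.get("spacy.tagger_scorer.v1")
tagger_score = make_scorer()  # the plain scoring function registered above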
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from ..tokens import Doc
+from ..util import registry
 from ..vocab import Vocab


@@ -70,7 +71,11 @@ subword_features = true
 @Language.factory(
     "textcat",
     assigns=["doc.cats"],
-    default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL},
+    default_config={
+        "threshold": 0.5,
+        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
+        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+    },
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
     },
 )
 def make_textcat(
-    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
+    nlp: Language,
+    name: str,
+    model: Model[List[Doc], List[Floats2d]],
+    threshold: float,
+    scorer: Optional[Callable],
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -96,7 +105,21 @@ def make_textcat(
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
     """
-    return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
+    return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+
+
+def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+    return Scorer.score_cats(
+        examples,
+        "cats",
+        multi_label=False,
+        **kwargs,
+    )
+
+
+@registry.scorers("spacy.textcat_scorer.v1")
+def make_textcat_scorer():
+    return textcat_score


 class TextCategorizer(TrainablePipe):
@@ -106,7 +129,13 @@ class TextCategorizer(TrainablePipe):
     """

     def __init__(
-        self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
+        self,
+        vocab: Vocab,
+        model: Model,
+        name: str = "textcat",
+        *,
+        threshold: float,
+        scorer: Optional[Callable] = textcat_score,
     ) -> None:
         """Initialize a text categorizer for single-label classification.

@@ -124,6 +153,7 @@ class TextCategorizer(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
+        self.scorer = scorer

     @property
     def labels(self) -> Tuple[str]:
@@ -354,26 +384,6 @@ class TextCategorizer(TrainablePipe):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)

-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
-        DOCS: https://spacy.io/api/textcategorizer#score
-        """
-        validate_examples(examples, "TextCategorizer.score")
-        self._validate_categories(examples)
-        kwargs.setdefault("threshold", self.cfg["threshold"])
-        kwargs.setdefault("positive_label", self.cfg["positive_label"])
-        return Scorer.score_cats(
-            examples,
-            "cats",
-            labels=self.labels,
-            multi_label=False,
-            **kwargs,
-        )
-
     def _validate_categories(self, examples: List[Example]):
         """Check whether the provided examples all have single-label cats annotations."""
         for ex in examples:
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -5,10 +5,11 @@ from thinc.api import Model, Config
 from thinc.types import Floats2d

 from ..language import Language
-from ..training import Example, validate_examples, validate_get_examples
+from ..training import Example, validate_get_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from ..tokens import Doc
+from ..util import registry
 from ..vocab import Vocab
 from .textcat import TextCategorizer

@@ -70,7 +71,11 @@ subword_features = true
 @Language.factory(
     "textcat_multilabel",
     assigns=["doc.cats"],
-    default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL},
+    default_config={
+        "threshold": 0.5,
+        "model": DEFAULT_MULTI_TEXTCAT_MODEL,
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+    },
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
@@ -86,7 +91,11 @@ subword_features = true
     },
 )
 def make_multilabel_textcat(
-    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
+    nlp: Language,
+    name: str,
+    model: Model[List[Doc], List[Floats2d]],
+    threshold: float,
+    scorer: Optional[Callable],
 ) -> "TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
@@ -97,7 +106,23 @@ def make_multilabel_textcat(
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
     """
-    return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold)
+    return MultiLabel_TextCategorizer(
+        nlp.vocab, model, name, threshold=threshold, scorer=scorer
+    )
+
+
+def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
+    return Scorer.score_cats(
+        examples,
+        "cats",
+        multi_label=True,
+        **kwargs,
+    )
+
+
+@registry.scorers("spacy.textcat_multilabel_scorer.v1")
+def make_textcat_multilabel_scorer():
+    return textcat_multilabel_score


 class MultiLabel_TextCategorizer(TextCategorizer):
@@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name: str = "textcat_multilabel",
         *,
         threshold: float,
+        scorer: Optional[Callable] = textcat_multilabel_score,
     ) -> None:
         """Initialize a text categorizer for multi-label classification.

@@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold}
         self.cfg = dict(cfg)
+        self.scorer = scorer

     def initialize(
         self,
@@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)

-    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
-        """Score a batch of examples.
-
-        examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
-
-        DOCS: https://spacy.io/api/textcategorizer#score
-        """
-        validate_examples(examples, "MultiLabel_TextCategorizer.score")
-        kwargs.setdefault("threshold", self.cfg["threshold"])
-        return Scorer.score_cats(
-            examples,
-            "cats",
-            labels=self.labels,
-            multi_label=True,
-            **kwargs,
-        )
-
     def _validate_categories(self, examples: List[Example]):
         """This component allows any type of single- or multi-label annotations.
         This method overwrites the more strict one from 'textcat'."""
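The single-label and multi-label scorers differ only in the multi_label flag; labels, threshold and (for single-label) positive_label now reach them through the component cfg via Pipe.score. Calling one directly therefore means passing those settings explicitly. A sketch, assuming a build that includes this commit; the import path is taken from this diff:

import spacy
from spacy.pipeline.textcat_multilabel import textcat_multilabel_score
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("good")
doc.cats = {"POS": 0.9, "NEG": 0.2}  # stand-in predictions
example = Example.from_dict(doc, {"cats": {"POS": 1.0, "NEG": 0.0}})
scores = textcat_multilabel_score([example], labels=["POS", "NEG"], threshold=0.5)
print(sorted(scores))  # cats_score, cats_macro_f, cats_f_per_type, ...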
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
+    cdef public object scorer
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
         beam_density=0.0,
         beam_update_prob=0.0,
         multitasks=tuple(),
-        incorrect_spans_key=None
+        incorrect_spans_key=None,
+        scorer=None,
     ):
         """Create a Parser.

@@ -117,6 +118,7 @@ cdef class Parser(TrainablePipe):
             self.add_multitask_objective(multitask)

         self._rehearsal_model = None
+        self.scorer = scorer

     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -537,7 +537,7 @@ class Scorer:

     @staticmethod
     def score_links(
-        examples: Iterable[Example], *, negative_labels: Iterable[str]
+        examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
     ) -> Dict[str, Any]:
         """Returns PRF for predicted links on the entity level.
         To disentangle the performance of the NEL from the NER,
@@ -711,7 +711,7 @@ class Scorer:
     }


-def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
+def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
     """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
     score_per_type = defaultdict(PRFScore)
     for eg in examples:
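Because Pipe.score now forwards the entire component cfg, every scoring function has to tolerate settings it does not use; that is what the widened signatures above provide. Custom scorers should follow the same contract, e.g.:

def custom_score(examples, **kwargs):
    # accept the full component cfg, read only what is needed
    threshold = kwargs.get("threshold", 0.5)
    return {"custom_metric": threshold}


assert custom_score([], threshold=0.7) == {"custom_metric": 0.7}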
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -32,24 +32,6 @@ def pattern_dicts():
     ]


-@registry.misc("attribute_ruler_patterns")
-def attribute_ruler_patterns():
-    return [
-        {
-            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
-            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
-        },
-        # one pattern sets the lemma
-        {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
-        # another pattern sets the morphology
-        {
-            "patterns": [[{"ORTH": "test"}]],
-            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
-            "index": 0,
-        },
-    ]
-
-
 @pytest.fixture
 def tag_map():
     return {
@@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")

     # initialize with patterns from misc registry
+    @registry.misc("attribute_ruler_patterns")
+    def attribute_ruler_patterns():
+        return [
+            {
+                "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
+                "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
+            },
+            # one pattern sets the lemma
+            {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
+            # another pattern sets the morphology
+            {
+                "patterns": [[{"ORTH": "test"}]],
+                "attrs": {"MORPH": "Case=Nom|Number=Sing"},
+                "index": 0,
+            },
+        ]
+
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }
@@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
     assert scores["lemma_acc"] == pytest.approx(0.2)
     # no morphs are set
     assert scores["morph_acc"] is None
+    nlp.remove_pipe("attribute_ruler")
+
+    # test with custom scorer
+    @registry.misc("weird_scorer.v1")
+    def make_weird_scorer():
+        def weird_scorer(examples, weird_score, **kwargs):
+            return {"weird_score": weird_score}
+
+        return weird_scorer
+
+    ruler = nlp.add_pipe(
+        "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
+    )
+    ruler.initialize(lambda: [], patterns=pattern_dicts)
+    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
+    assert scores["weird_score"] == 0.12345
+    assert "token_acc" in scores
+    assert "lemma_acc" not in scores
+    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
+    assert scores["weird_score"] == 0.23456


 def test_attributeruler_rule_order(nlp):
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -95,6 +95,7 @@ class registry(thinc.registry):
     readers = catalogue.create("spacy", "readers", entry_points=True)
     augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
+    scorers = catalogue.create("spacy", "scorers", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
     # load them via the entry points. The "true" factories are added via the
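This registry line is what makes every {"@scorers": ...} block above resolvable, and entry_points=True additionally lets installed packages contribute scorers. Registration and lookup round-trip like any other catalogue registry; a sketch with a hypothetical name:

from spacy.util import registry


@registry.scorers("noop_scorer.v1")  # hypothetical name
def make_noop_scorer():
    def noop_score(examples, **kwargs):
        return {}

    return noop_score


scorer = registry.scorers.get("noop_scorer.v1")()
assert scorer([]) == {}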