Allow pipeline components to set default scores and weights

This commit is contained in:
Ines Montani 2020-07-26 13:18:43 +02:00
parent 787d066e22
commit 2470486543
8 changed files with 100 additions and 8 deletions

View File

@ -34,8 +34,8 @@ seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tag_acc", "dep_uas", "dep_las", "ents_f"]
score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4}
scores = ["token_acc", "speed"]
score_weights = {}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false

View File

@ -21,7 +21,7 @@ from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
from .util import SimpleFrozenDict
from .util import SimpleFrozenDict, combine_score_weights
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
@ -218,16 +218,24 @@ class Language:
@property
def config(self) -> Config:
self._config.setdefault("nlp", {})
self._config.setdefault("training", {})
self._config["nlp"]["lang"] = self.lang
# We're storing the filled config for each pipeline component and so
# we can populate the config again later
pipeline = {}
scores = self._config["training"].get("scores", [])
score_weights = []
for pipe_name in self.pipe_names:
pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
scores.extend(pipe_meta.scores)
if pipe_meta.score_weights:
score_weights.append(pipe_meta.score_weights)
self._config["nlp"]["pipeline"] = self.pipe_names
self._config["components"] = pipeline
self._config["training"]["scores"] = list(scores)
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@ -348,6 +356,8 @@ class Language:
assigns: Iterable[str] = tuple(),
requires: Iterable[str] = tuple(),
retokenizes: bool = False,
scores: Iterable[str] = tuple(),
score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
"""Register a new pipeline component factory. Can be used as a decorator
@ -393,6 +403,8 @@ class Language:
default_config=default_config,
assigns=validate_attrs(assigns),
requires=validate_attrs(requires),
scores=scores,
score_weights=score_weights,
retokenizes=retokenizes,
)
cls.set_factory_meta(name, factory_meta)
@ -417,6 +429,8 @@ class Language:
assigns: Iterable[str] = tuple(),
requires: Iterable[str] = tuple(),
retokenizes: bool = False,
scores: Iterable[str] = tuple(),
score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable[[Doc], Doc]] = None,
) -> Callable:
"""Register a new pipeline component. Can be used for stateless function
@ -450,6 +464,8 @@ class Language:
assigns=assigns,
requires=requires,
retokenizes=retokenizes,
scores=scores,
score_weights=score_weights,
func=factory_func,
)
return component_func
@ -1484,6 +1500,8 @@ class FactoryMeta:
assigns: Iterable[str] = tuple()
requires: Iterable[str] = tuple()
retokenizes: bool = False
scores: Iterable[str] = tuple()
score_weights: Dict[str, float] = None
def _get_config_overrides(

View File

@ -42,7 +42,9 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
}
},
scores=["dep_uas", "dep_las", "sents_f"],
score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
)
def make_parser(
nlp: Language,

View File

@ -40,7 +40,9 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_NER_MODEL,
}
},
scores=["ents_f", "ents_r", "ents_p"],
score_weights={"ents_f": 1.0, "ents_r": 0.0, "ents_p": 0.0},
)
def make_ner(
nlp: Language,

View File

@ -33,7 +33,9 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL}
default_config={"model": DEFAULT_SENTER_MODEL},
scores=["sents_p", "sents_r", "sents_f"],
score_weights={"sents_p": 0.0, "sents_r": 0.0, "sents_f": 1.0},
)
def make_senter(nlp: Language, name: str, model: Model):
return SentenceRecognizer(nlp.vocab, model, name)

View File

@ -39,7 +39,9 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False}
default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False},
scores=["tag_acc", "pos_acc"],
score_weights={"tag_acc": 0.5, "pos_acc": 0.5},
)
def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool):
return Tagger(nlp.vocab, model, name, set_morphology=set_morphology)

View File

@ -3,7 +3,7 @@ from spacy.language import Language
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.tokens import Doc
from spacy.util import registry, SimpleFrozenDict
from spacy.util import registry, SimpleFrozenDict, combine_score_weights
from thinc.api import Model, Linear
from thinc.config import ConfigValidationError
from pydantic import StrictInt, StrictStr
@ -328,3 +328,52 @@ def test_language_factories_invalid():
assert len(nlp.factories)
with pytest.raises(NotImplementedError):
nlp.factories["foo"] = "bar"
@pytest.mark.parametrize(
"weights,expected",
[
([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
(
[{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
{"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
),
(
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
],
)
def test_language_factories_combine_score_weights(weights, expected):
result = combine_score_weights(weights)
assert sum(result.values()) in (0.99, 1.0)
assert result == expected
def test_language_factories_scores():
name = "test_language_factories_scores"
func = lambda doc: doc
scores1 = ["a1", "a2"]
weights1 = {"a1": 0.5, "a2": 0.5}
scores2 = ["b1", "b2", "b3"]
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
Language.component(
f"{name}1", scores=scores1, score_weights=weights1, func=func,
)
Language.component(
f"{name}2", scores=scores2, score_weights=weights2, func=func,
)
meta1 = Language.get_factory_meta(f"{name}1")
assert meta1.scores == scores1
assert meta1.score_weights == weights1
meta2 = Language.get_factory_meta(f"{name}2")
assert meta2.scores == scores2
assert meta2.score_weights == weights2
nlp = Language(config={"training": {"scores": ["speed"], "score_weights": {}}})
nlp.add_pipe(f"{name}1")
nlp.add_pipe(f"{name}2")
cfg = nlp.config["training"]
assert cfg["scores"] == ["speed", *scores1, *scores2]
expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
assert cfg["score_weights"] == expected_weights

View File

@ -1130,6 +1130,23 @@ def get_arg_names(func: Callable) -> List[str]:
return list(set([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
"""Combine and normalize score weights defined by components, e.g.
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
weights (List[dict]): The weights defined by the components.
RETURNS (Dict[str, float]): The combined and normalized weights.
"""
result = {}
for w_dict in weights:
# We need to account for weights that don't sum to 1.0 and normalize the
# score weights accordingly, then divide score by the number of components
total = sum([w for w in w_dict.values()])
for key, value in w_dict.items():
result[key] = round(value / total / len(weights), 2)
return result
class DummyTokenizer:
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)