Remove scores list from config and document

This commit is contained in:
Ines Montani 2020-07-28 11:22:24 +02:00
parent 9b704c3db3
commit 0094cb0d04
6 changed files with 46 additions and 40 deletions

View File

@ -445,9 +445,8 @@ def setup_printer(
def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
score_cols = training["scores"]
nlp.meta["performance"] = {}
for metric in score_cols:
for metric in training["scores_weights"]:
nlp.meta["performance"][metric] = info["other_scores"][metric]
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@ -34,7 +34,6 @@ seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["token_acc", "speed"]
score_weights = {}
# These settings are invalid for the transformer models.
init_tok2vec = null

View File

@ -224,19 +224,15 @@ class Language:
# We're storing the filled config for each pipeline component and so
# we can populate the config again later
pipeline = {}
scores = self._config["training"].get("scores", [])
score_weights = []
for pipe_name in self.pipe_names:
pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
scores.extend(pipe_meta.scores)
if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = self.pipe_names
self._config["components"] = pipeline
self._config["training"]["scores"] = sorted(set(scores))
combined_score_weights = combine_score_weights(score_weights)
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
@ -376,6 +372,12 @@ class Language:
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
scores (Iterable[str]): All scores set by the component if it's trainable,
e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator.
"""
if not isinstance(name, str):
@ -448,6 +450,12 @@ class Language:
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
scores (Iterable[str]): All scores set by the component if it's trainable,
e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator.
"""
if name is not None and not isinstance(name, str):
@ -1505,7 +1513,7 @@ class FactoryMeta:
requires: Iterable[str] = tuple()
retokenizes: bool = False
scores: Iterable[str] = tuple()
default_score_weights: Dict[str, float] = None
default_score_weights: Optional[Dict[str, float]] = None # noqa: E704
def _get_config_overrides(

View File

@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel):
seed: Optional[StrictInt] = Field(..., title="Random seed")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
scores: List[StrictStr] = Field(..., title="Score types to be printed in overview")
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
batch_by: StrictStr = Field(..., title="Batch examples by type")

View File

@ -343,10 +343,7 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
(
[{"a": 0.5, "b": 0.5}, {"b": 1.0}],
{"a": 0.25, "b": 0.75},
),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
],
)
def test_language_factories_combine_score_weights(weights, expected):
@ -371,11 +368,9 @@ def test_language_factories_scores():
meta2 = Language.get_factory_meta(f"{name}2")
assert meta2.default_score_weights == weights2
nlp = Language()
nlp._config["training"]["scores"] = ["speed"]
nlp._config["training"]["score_weights"] = {}
nlp.add_pipe(f"{name}1")
nlp.add_pipe(f"{name}2")
cfg = nlp.config["training"]
assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())])
expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
assert cfg["score_weights"] == expected_weights

View File

@ -43,12 +43,14 @@ decorator. For more details and examples, see the
> ```
| Name | Type | Description |
| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
## Language.factory {#factory tag="classmethod"}
@ -88,13 +90,15 @@ examples, see the
> ```
| Name | Type | Description |
| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
## Language.\_\_init\_\_ {#init tag="method"}
@ -768,9 +772,11 @@ component is added to the pipeline and stored on the `Language` class for each
component instance and factory instance.
| Name | Type | Description |
| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |