Remove scores list from config and document

2025-12-16 14:44:19 +03:00 · 2020-07-28 11:22:24 +02:00 · 2020-07-28 11:22:24 +02:00 · 0094cb0d04
commit 0094cb0d04
parent 9b704c3db3
6 changed files with 46 additions and 40 deletions
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -445,9 +445,8 @@ def setup_printer(
 def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
 ) -> None:
    score_cols = training["scores"]
    nlp.meta["performance"] = {}
-    for metric in score_cols:
+    for metric in training["scores_weights"]:
        nlp.meta["performance"][metric] = info["other_scores"][metric]
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -34,7 +34,6 @@ seed = 0
 accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
 scores = ["token_acc", "speed"]
 score_weights = {}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
--- a/spacy/language.py
+++ b/spacy/language.py
@ -224,19 +224,15 @@ class Language:
        # We're storing the filled config for each pipeline component and so
        # we can populate the config again later
        pipeline = {}
        scores = self._config["training"].get("scores", [])
        score_weights = []
        for pipe_name in self.pipe_names:
            pipe_meta = self.get_pipe_meta(pipe_name)
            pipe_config = self.get_pipe_config(pipe_name)
            pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
            scores.extend(pipe_meta.scores)
            if pipe_meta.default_score_weights:
                score_weights.append(pipe_meta.default_score_weights)
        self._config["nlp"]["pipeline"] = self.pipe_names
        self._config["components"] = pipeline
        self._config["training"]["scores"] = sorted(set(scores))
        combined_score_weights = combine_score_weights(score_weights)
        self._config["training"]["score_weights"] = combine_score_weights(score_weights)
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
@ -376,6 +372,12 @@ class Language:
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        scores (Iterable[str]): All scores set by the component if it's trainable,
            e.g. ["ents_f", "ents_r", "ents_p"].
        default_score_weights (Dict[str, float]): The scores to report during
            training, and their default weight towards the final score used to
            select the best model. Weights should sum to 1.0 per component and
            will be combined and normalized for the whole pipeline.
        func (Optional[Callable]): Factory function if not used as a decorator.
        """
        if not isinstance(name, str):
@ -448,6 +450,12 @@ class Language:
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        scores (Iterable[str]): All scores set by the component if it's trainable,
            e.g. ["ents_f", "ents_r", "ents_p"].
        default_score_weights (Dict[str, float]): The scores to report during
            training, and their default weight towards the final score used to
            select the best model. Weights should sum to 1.0 per component and
            will be combined and normalized for the whole pipeline.
        func (Optional[Callable]): Factory function if not used as a decorator.
        """
        if name is not None and not isinstance(name, str):
@ -1505,7 +1513,7 @@ class FactoryMeta:
    requires: Iterable[str] = tuple()
    retokenizes: bool = False
    scores: Iterable[str] = tuple()
-    default_score_weights: Dict[str, float] = None
+    default_score_weights: Optional[Dict[str, float]] = None  # noqa: E704
 def _get_config_overrides(
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel):
    seed: Optional[StrictInt] = Field(..., title="Random seed")
    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
    use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
-    scores: List[StrictStr] = Field(..., title="Score types to be printed in overview")
+    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
    batch_by: StrictStr = Field(..., title="Batch examples by type")
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@ -343,10 +343,7 @@ def test_language_factories_invalid():
            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
        ),
-        (
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
            {"a": 0.25, "b": 0.75},
        ),
    ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@ -371,11 +368,9 @@ def test_language_factories_scores():
    meta2 = Language.get_factory_meta(f"{name}2")
    assert meta2.default_score_weights == weights2
    nlp = Language()
    nlp._config["training"]["scores"] = ["speed"]
    nlp._config["training"]["score_weights"] = {}
    nlp.add_pipe(f"{name}1")
    nlp.add_pipe(f"{name}2")
    cfg = nlp.config["training"]
    assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())])
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -42,14 +42,16 @@ decorator. For more details and examples, see the
 > Language.component("my_component2", func=my_component)
 > ```
-| Name           | Type                 | Description                                                                                                                                   |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
-| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`         | str                  | The name of the component factory.                                                                                                            |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
-| _keyword-only_ |                      |                                                                                                                                               |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
-| `assigns`      | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `requires`     | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `retokenizes`  | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
-| `func`         | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
 | `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 ## Language.factory {#factory tag="classmethod"}
@ -87,15 +89,17 @@ examples, see the
 > )
 > ```
-| Name             | Type                 | Description                                                                                                                                   |
+| Name                    | Type                 | Description                                                                                                                                                                                                                 |
-| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`           | str                  | The name of the component factory.                                                                                                            |
+| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
-| _keyword-only_   |                      |                                                                                                                                               |
+| _keyword-only_          |                      |                                                                                                                                                                                                                             |
-| `default_config` | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                   |
+| `default_config`        | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
-| `assigns`        | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `requires`       | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `retokenizes`    | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                      |
+| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
-| `func`           | `Optional[Callable]` | Optional function if not used a a decorator.                                                                                                  |
+| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
 | `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 ## Language.\_\_init\_\_ {#init tag="method"}
@ -767,10 +771,12 @@ provided by the [`@Language.component`](/api/language#component) or
 component is added to the pipeline and stored on the `Language` class for each
 component instance and factory instance.
-| Name             | Type             | Description                                                                                                                                    |
+| Name                    | Type               | Description                                                                                                                                                                                                                 |
-| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `factory`        | str              | The name of the registered component factory.                                                                                                  |
+| `factory`               | str                | The name of the registered component factory.                                                                                                                                                                               |
-| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments.                                                                    |
+| `default_config`        | `Dict[str, Any]`   | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
-| `assigns`        | `Iterable[str]`  | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
+| `assigns`               | `Iterable[str]`    | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `requires`       | `Iterable[str]`  | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
+| `requires`              | `Iterable[str]`    | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->                                                                               |
-| `retokenizes`    | bool             | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                       |
+| `retokenizes`           | bool               | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->                                                                                                                    |
 | `scores`                | `Iterable[str]`    | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                   |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |