Remove scores list from config and document

This commit is contained in:
Ines Montani 2020-07-28 11:22:24 +02:00
parent 9b704c3db3
commit 0094cb0d04
6 changed files with 46 additions and 40 deletions

View File

@ -445,9 +445,8 @@ def setup_printer(
def update_meta( def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None: ) -> None:
score_cols = training["scores"]
nlp.meta["performance"] = {} nlp.meta["performance"] = {}
for metric in score_cols: for metric in training["score_weights"]:
nlp.meta["performance"][metric] = info["other_scores"][metric] nlp.meta["performance"][metric] = info["other_scores"][metric]
for pipe_name in nlp.pipe_names: for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@ -34,7 +34,6 @@ seed = 0
accumulate_gradient = 1 accumulate_gradient = 1
use_pytorch_for_gpu_memory = false use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated. # Control how scores are printed and checkpoints are evaluated.
scores = ["token_acc", "speed"]
score_weights = {} score_weights = {}
# These settings are invalid for the transformer models. # These settings are invalid for the transformer models.
init_tok2vec = null init_tok2vec = null

View File

@ -224,19 +224,15 @@ class Language:
# We're storing the filled config for each pipeline component and so # We're storing the filled config for each pipeline component and so
# we can populate the config again later # we can populate the config again later
pipeline = {} pipeline = {}
scores = self._config["training"].get("scores", [])
score_weights = [] score_weights = []
for pipe_name in self.pipe_names: for pipe_name in self.pipe_names:
pipe_meta = self.get_pipe_meta(pipe_name) pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name) pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
scores.extend(pipe_meta.scores)
if pipe_meta.default_score_weights: if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights) score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = self.pipe_names self._config["nlp"]["pipeline"] = self.pipe_names
self._config["components"] = pipeline self._config["components"] = pipeline
self._config["training"]["scores"] = sorted(set(scores))
combined_score_weights = combine_score_weights(score_weights)
self._config["training"]["score_weights"] = combine_score_weights(score_weights) self._config["training"]["score_weights"] = combine_score_weights(score_weights)
if not srsly.is_json_serializable(self._config): if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config)) raise ValueError(Errors.E961.format(config=self._config))
@ -376,6 +372,12 @@ class Language:
e.g. "token.ent_id". Used for pipeline analysis. e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization. retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis. Used for pipeline analysis.
scores (Iterable[str]): All scores set by the component if it's trainable,
e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator. func (Optional[Callable]): Factory function if not used as a decorator.
""" """
if not isinstance(name, str): if not isinstance(name, str):
@ -448,6 +450,12 @@ class Language:
e.g. "token.ent_id". Used for pipeline analysis. e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization. retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis. Used for pipeline analysis.
scores (Iterable[str]): All scores set by the component if it's trainable,
e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
func (Optional[Callable]): Factory function if not used as a decorator. func (Optional[Callable]): Factory function if not used as a decorator.
""" """
if name is not None and not isinstance(name, str): if name is not None and not isinstance(name, str):
@ -1505,7 +1513,7 @@ class FactoryMeta:
requires: Iterable[str] = tuple() requires: Iterable[str] = tuple()
retokenizes: bool = False retokenizes: bool = False
scores: Iterable[str] = tuple() scores: Iterable[str] = tuple()
default_score_weights: Dict[str, float] = None default_score_weights: Optional[Dict[str, float]] = None # noqa: E704
def _get_config_overrides( def _get_config_overrides(

View File

@ -207,8 +207,7 @@ class ConfigSchemaTraining(BaseModel):
seed: Optional[StrictInt] = Field(..., title="Random seed") seed: Optional[StrictInt] = Field(..., title="Random seed")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch") use_pytorch_for_gpu_memory: StrictBool = Field(..., title="Allocate memory via PyTorch")
scores: List[StrictStr] = Field(..., title="Score types to be printed in overview") score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Weights of each score type for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size") discard_oversize: StrictBool = Field(..., title="Whether to skip examples longer than batch size")
batch_by: StrictStr = Field(..., title="Batch examples by type") batch_by: StrictStr = Field(..., title="Batch examples by type")

View File

@ -343,10 +343,7 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
), ),
( ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
[{"a": 0.5, "b": 0.5}, {"b": 1.0}],
{"a": 0.25, "b": 0.75},
),
], ],
) )
def test_language_factories_combine_score_weights(weights, expected): def test_language_factories_combine_score_weights(weights, expected):
@ -371,11 +368,9 @@ def test_language_factories_scores():
meta2 = Language.get_factory_meta(f"{name}2") meta2 = Language.get_factory_meta(f"{name}2")
assert meta2.default_score_weights == weights2 assert meta2.default_score_weights == weights2
nlp = Language() nlp = Language()
nlp._config["training"]["scores"] = ["speed"]
nlp._config["training"]["score_weights"] = {} nlp._config["training"]["score_weights"] = {}
nlp.add_pipe(f"{name}1") nlp.add_pipe(f"{name}1")
nlp.add_pipe(f"{name}2") nlp.add_pipe(f"{name}2")
cfg = nlp.config["training"] cfg = nlp.config["training"]
assert cfg["scores"] == sorted(["speed", *list(weights1.keys()), *list(weights2.keys())])
expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
assert cfg["score_weights"] == expected_weights assert cfg["score_weights"] == expected_weights

View File

@ -42,14 +42,16 @@ decorator. For more details and examples, see the
> Language.component("my_component2", func=my_component) > Language.component("my_component2", func=my_component)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. | | `name` | str | The name of the component factory. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | | `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
## Language.factory {#factory tag="classmethod"} ## Language.factory {#factory tag="classmethod"}
@ -87,15 +89,17 @@ examples, see the
> ) > )
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. | | `name` | str | The name of the component factory. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | | `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
## Language.\_\_init\_\_ {#init tag="method"} ## Language.\_\_init\_\_ {#init tag="method"}
@ -767,10 +771,12 @@ provided by the [`@Language.component`](/api/language#component) or
component is added to the pipeline and stored on the `Language` class for each component is added to the pipeline and stored on the `Language` class for each
component instance and factory instance. component instance and factory instance.
| Name | Type | Description | | Name | Type | Description |
| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | str | The name of the registered component factory. | | `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->  | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |