Mirror of https://github.com/explosion/spaCy.git

Merge remote-tracking branch 'upstream/develop' into feature/missing-dep

Commit fec9b81aa2
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
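For context, the two new keys accept registered callbacks that receive the `nlp` object. A minimal sketch of wiring one up from Python, assuming a hypothetical callback name `my_before_init` (not part of the diff):

```python
# Sketch only: registering a hypothetical before_init callback and pointing
# the new [initialize.before_init] key at it.
from spacy.lang.en import English
from spacy.util import registry

@registry.callbacks("my_before_init")
def make_my_before_init():
    def my_before_init(nlp):
        # runs before the pipeline components are initialized
        nlp.meta["seen_before_init"] = True
        return nlp
    return my_before_init

nlp = English()
config = nlp.config
config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp = English.from_config(config)
nlp.initialize()
assert nlp.meta["seen_before_init"]
```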
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer

     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert the empty tag "" to the missing value None so that both
+        # misaligned tokens and tokens with missing annotation have the
+        # default missing value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
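The conversion being introduced is easy to check in isolation; a tiny sketch with made-up tag values:

```python
# Sketch: the empty tag "" (misaligned token or absent annotation) becomes
# None, the default missing value for the sequence loss.
aligned_tags = ["", "V", "J", ""]
truths = [tag if tag != "" else None for tag in aligned_tags]
assert truths == [None, "V", "J", None]
```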
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on

     class Config:
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]

+PARTIAL_DATA = [
+    # partial annotation
+    ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
+]
+
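A hedged sketch of feeding such partially annotated data to the tagger (standard v3 training API; the blank-pipeline setup is illustrative, not part of the diff):

```python
# Sketch: tokens tagged "" are treated as missing by Tagger.get_loss and
# contribute no gradient, so updates come only from annotated tokens.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("V", "S", "J"):
    tagger.add_label(tag)

examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in PARTIAL_DATA
]
nlp.initialize(lambda: examples)
losses = nlp.update(examples)
```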
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError


 from ..util import make_tempdir
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
         nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the [initialize]
+    block shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for the nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
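For context, "resolving" a block means instantiating any `@`-registered functions it references. A minimal sketch (the `@misc` function name is made up):

```python
# Sketch: resolving a config block instantiates its registered functions.
from spacy.util import registry

@registry.misc("my_lookups_data")
def make_lookups_data():
    return {"lemma_lookup": {"mice": "mouse"}}

block = {"lookups": {"@misc": "my_lookups_data"}}
resolved = registry.resolve(block)
assert resolved["lookups"]["lemma_lookup"]["mice"] == "mouse"
```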
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False

     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():

         return after_pipeline_creation

+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])


 def test_language_from_config_before_after_init_invalid():
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
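For context, `T["before_to_disk"]` comes from the `[training.before_to_disk]` callback slot. A hedged sketch of such a callback (the name and the meta key are invented for illustration):

```python
# Sketch: a before_to_disk callback that strips transient state so it isn't
# serialized with every checkpoint.
from spacy.util import registry

@registry.callbacks("clean_before_save")
def make_clean_before_save():
    def clean_before_save(nlp):
        nlp.meta.pop("scratch", None)  # hypothetical transient key
        return nlp
    return clean_before_save
```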
@@ -87,40 +100,31 @@ def train(
             if is_best_checkpoint is not None and output_path is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / DIR_MODEL_LAST)
-                    if is_best_checkpoint:
-                        with nlp.use_params(optimizer.averages):
-                            nlp.to_disk(output_path / DIR_MODEL_BEST)
-
+                save_checkpoint(is_best_checkpoint)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
             # specific error, but we do want to warn that we're trying
             # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
-        if optimizer.averages:
-            nlp.use_params(optimizer.averages)
-        if output_path is not None:
-            final_model_path = output_path / DIR_MODEL_LAST
-            nlp.to_disk(final_model_path)
-            # This will only run if we don't hit an error
-            stdout.write(
-                msg.good("Saved pipeline to output directory", final_model_path) + "\n"
-            )
-            return (nlp, final_model_path)
-        else:
-            return (nlp, None)
+        save_checkpoint(False)
+    # This will only run if we didn't hit an error
+    if optimizer.averages:
+        nlp.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (nlp, output_path / DIR_MODEL_LAST)
+    else:
+        return (nlp, None)


 def train_while_improving(
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc) as
 input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:

 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]
     return get_candidates
 ```

-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.

@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:

-| Callback                  | Description |
-| ------------------------- | ----------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| Callback                      | Description |
+| ----------------------------- | ----------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |

 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:

 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls

     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```

 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy

 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls

     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}

 During training, the results of each step are passed to a logger function. By
@@ -930,6 +930,55 @@ treebank.

 </Project>

+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}

 For most use cases, you **shouldn't** have to write your own training scripts