Mirror of https://github.com/explosion/spaCy.git

Commit fec9b81aa2
Merge remote-tracking branch 'upstream/develop' into feature/missing-dep
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer

     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
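Taken together, the two hunks above bracket `Language.initialize` with the new optional hooks. A minimal end-to-end sketch of how a pipeline author would use them; the callback name `my_before_init` is invented for illustration and is not part of this diff:

```python
import spacy
from spacy.lang.en import English

@spacy.registry.callbacks("my_before_init")
def make_before_init():
    def before_init(nlp):
        # Runs at the top of nlp.initialize(), before init_vocab() and
        # before any component's initialize method.
        nlp.meta["init_hook"] = "ran"
        return nlp
    return before_init

nlp = English()
# Point the [initialize.before_init] block at the registered callback
nlp.config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp.initialize()
assert nlp.meta["init_hook"] == "ran"
```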
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert empty tag "" to missing value None so that both misaligned
+        # tokens and tokens with missing annotation have the default missing
+        # value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
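The conversion above means misaligned tokens and tokens that were simply never annotated both become `None`, the default missing value of `SequenceCategoricalCrossentropy`, so neither contributes to the loss. A toy illustration with assumed data, not taken from the diff:

```python
# eg.get_aligned("TAG", as_string=True) yields "" for tokens without a gold tag
aligned_tags = ["", "V", "J", ""]
truths = [tag if tag != "" else None for tag in aligned_tags]
assert truths == [None, "V", "J", None]  # None positions produce no gradient
```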
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on

     class Config:
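The new fields are typed as optional callables that take and return the `nlp` object. A rough standalone illustration of what that validation enforces, using plain pydantic rather than spaCy's actual schema module:

```python
from typing import Any, Callable, Optional
from pydantic import BaseModel

class InitSketch(BaseModel):
    # Stand-ins for before_init/after_init: a callable or None
    before_init: Optional[Callable[[Any], Any]] = None
    after_init: Optional[Callable[[Any], Any]] = None

InitSketch(before_init=lambda nlp: nlp)  # accepted: value is callable
InitSketch(after_init=None)              # accepted: the hooks are optional
```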
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]

 PARTIAL_DATA = [
+    # partial annotation
     ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
 ]

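These fixtures exercise the missing-value handling from the `Tagger.get_loss` hunk: `""` marks tokens with no gold tag, and the second example tokenizes "hates" as "hate" + "s" so the alignment is imperfect. A minimal sketch of feeding such data to the tagger, assuming the standard v3 training APIs shown elsewhere in this diff:

```python
import spacy
from spacy.training import Example

PARTIAL_DATA = [("I like green eggs", {"tags": ["", "V", "J", ""]})]

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("V", "J"):
    tagger.add_label(tag)
examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in PARTIAL_DATA
]
optimizer = nlp.initialize(get_examples=lambda: examples)
losses = nlp.update(examples, sgd=optimizer)  # the "" tags contribute no loss
```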
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError


 from ..util import make_tempdir
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
     nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the [initialize]
+    shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False

     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():

         return after_pipeline_creation

+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])


 def test_language_from_config_before_after_init_invalid():
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
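The closure always writes the latest checkpoint and produces the best checkpoint by copying the directory instead of serializing the pipeline a second time. A standalone sketch of that copy pattern, with illustrative paths standing in for the directory constants:

```python
import shutil
from pathlib import Path

output_path = Path("output")
last = output_path / "model-last"   # illustrative names for DIR_MODEL_LAST
best = output_path / "model-best"   # and DIR_MODEL_BEST
last.mkdir(parents=True, exist_ok=True)  # stands in for nlp.to_disk(last)
if best.exists():
    shutil.rmtree(best)      # copytree requires that the target not exist
shutil.copytree(last, best)  # cheaper than serializing the pipeline again
```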
@@ -87,38 +100,29 @@ def train(
             if is_best_checkpoint is not None and output_path is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)
-                with nlp.use_params(optimizer.averages):
-                    nlp = before_to_disk(nlp)
-                    nlp.to_disk(output_path / DIR_MODEL_LAST)
-                if is_best_checkpoint:
-                    with nlp.use_params(optimizer.averages):
-                        nlp.to_disk(output_path / DIR_MODEL_BEST)
+                save_checkpoint(is_best_checkpoint)

     except Exception as e:
         if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error, but we do want to warn that we're trying
-            # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
+        save_checkpoint(False)
+    # This will only run if we didn't hit an error
     if optimizer.averages:
         nlp.use_params(optimizer.averages)
     if output_path is not None:
-        final_model_path = output_path / DIR_MODEL_LAST
-        nlp.to_disk(final_model_path)
-        # This will only run if we don't hit an error
         stdout.write(
-            msg.good("Saved pipeline to output directory", final_model_path) + "\n"
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
         )
-        return (nlp, final_model_path)
+        return (nlp, output_path / DIR_MODEL_LAST)
     else:
         return (nlp, None)
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc) as
 input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:

 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]
     return get_candidates
 ```

-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.
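For context, a sketch of the `create_instances` function whose tail appears in the hunk above. The signature comes from the hunk header; the body and registry name are an illustrative reconstruction, not the documented implementation:

```python
from typing import Callable, List, Tuple

import spacy
from spacy.tokens import Doc, Span

@spacy.registry.misc("instance_generator.v1")  # hypothetical registry name
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                if ent1 != ent2 and abs(ent2.start - ent1.start) <= max_length:
                    # keep entity pairs within max_length tokens of each other
                    candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```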
@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:

 | Callback | Description |
-| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
+| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |

 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:

 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls

     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```

 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy

 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls

     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}

 During training, the results of each step are passed to a logger function. By
@@ -930,6 +930,55 @@ treebank.

 </Project>

+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}

 For most use cases, you **shouldn't** have to write your own training scripts