Mirror of https://github.com/explosion/spaCy.git

Merge remote-tracking branch 'upstream/develop' into feature/missing-dep

Commit fec9b81aa2
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
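For context, the two new keys accept registered callbacks that receive the `nlp` object. A minimal sketch of wiring one up from Python, assuming a hypothetical callback name `my_before_init` (not part of the diff):

```python
# Sketch only: registering a hypothetical before_init callback and pointing
# the new [initialize.before_init] key at it.
from spacy.lang.en import English
from spacy.util import registry

@registry.callbacks("my_before_init")
def make_my_before_init():
    def my_before_init(nlp):
        # runs before the pipeline components are initialized
        nlp.meta["seen_before_init"] = True
        return nlp
    return my_before_init

nlp = English()
config = nlp.config
config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp = English.from_config(config)
nlp.initialize()
assert nlp.meta["seen_before_init"]
```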
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer

     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert the empty tag "" to the missing value None so that both
+        # misaligned tokens and tokens with missing annotation have the
+        # default missing value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
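The conversion being introduced is easy to check in isolation; a tiny sketch with made-up tag values:

```python
# Sketch: the empty tag "" (misaligned token or absent annotation) becomes
# None, the default missing value for the sequence loss.
aligned_tags = ["", "V", "J", ""]
truths = [tag if tag != "" else None for tag in aligned_tags]
assert truths == [None, "V", "J", None]
```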
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on

     class Config:
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]

+PARTIAL_DATA = [
+    # partial annotation
+    ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
+]
+
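A hedged sketch of feeding such partially annotated data to the tagger (standard v3 training API; the blank-pipeline setup is illustrative, not part of the diff):

```python
# Sketch: tokens tagged "" are treated as missing by Tagger.get_loss and
# contribute no gradient, so updates come only from annotated tokens.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("V", "S", "J"):
    tagger.add_label(tag)

examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in PARTIAL_DATA
]
nlp.initialize(lambda: examples)
losses = nlp.update(examples)
```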
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError


 from ..util import make_tempdir
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
         nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the [initialize]
+    block shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for the nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
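For context, "resolving" a block means instantiating any `@`-registered functions it references. A minimal sketch (the `@misc` function name is made up):

```python
# Sketch: resolving a config block instantiates its registered functions.
from spacy.util import registry

@registry.misc("my_lookups_data")
def make_lookups_data():
    return {"lemma_lookup": {"mice": "mouse"}}

block = {"lookups": {"@misc": "my_lookups_data"}}
resolved = registry.resolve(block)
assert resolved["lookups"]["lemma_lookup"]["mice"] == "mouse"
```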
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False

     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():

         return after_pipeline_creation

+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])


 def test_language_from_config_before_after_init_invalid():
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
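For context, `T["before_to_disk"]` comes from the `[training.before_to_disk]` callback slot. A hedged sketch of such a callback (the name and the meta key are invented for illustration):

```python
# Sketch: a before_to_disk callback that strips transient state so it isn't
# serialized with every checkpoint.
from spacy.util import registry

@registry.callbacks("clean_before_save")
def make_clean_before_save():
    def clean_before_save(nlp):
        nlp.meta.pop("scratch", None)  # hypothetical transient key
        return nlp
    return clean_before_save
```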
@@ -87,40 +100,31 @@ def train(
             if is_best_checkpoint is not None and output_path is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / DIR_MODEL_LAST)
-                    if is_best_checkpoint:
-                        with nlp.use_params(optimizer.averages):
-                            nlp.to_disk(output_path / DIR_MODEL_BEST)
-
+                save_checkpoint(is_best_checkpoint)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
             # specific error, but we do want to warn that we're trying
             # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
-        if optimizer.averages:
-            nlp.use_params(optimizer.averages)
-        if output_path is not None:
-            final_model_path = output_path / DIR_MODEL_LAST
-            nlp.to_disk(final_model_path)
-            # This will only run if we don't hit an error
-            stdout.write(
-                msg.good("Saved pipeline to output directory", final_model_path) + "\n"
-            )
-            return (nlp, final_model_path)
-        else:
-            return (nlp, None)
+        save_checkpoint(False)
+    # This will only run if we didn't hit an error
+    if optimizer.averages:
+        nlp.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (nlp, output_path / DIR_MODEL_LAST)
+    else:
+        return (nlp, None)


 def train_while_improving(
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc) as
 input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:

 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]
     return get_candidates
 ```

-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.

@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:

-| Callback                  | Description |
-| ------------------------- | ----------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| Callback                      | Description |
+| ----------------------------- | ----------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |

 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:

 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls

     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```

 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy

 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls

     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}

 During training, the results of each step are passed to a logger function. By
@@ -930,6 +930,55 @@ treebank.

 </Project>

+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}

 For most use cases, you **shouldn't** have to write your own training scripts