Mirror of https://github.com/explosion/spaCy.git

Commit fec9b81aa2
Merge remote-tracking branch 'upstream/develop' into feature/missing-dep
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer

     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
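Taken together, the two hunks above bracket `Language.initialize` with the new optional hooks. A minimal end-to-end sketch of how a pipeline author would use them; the callback name `my_before_init` is invented for illustration and is not part of this diff:

```python
import spacy
from spacy.lang.en import English

@spacy.registry.callbacks("my_before_init")
def make_before_init():
    def before_init(nlp):
        # Runs at the top of nlp.initialize(), before init_vocab() and
        # before any component's initialize method.
        nlp.meta["init_hook"] = "ran"
        return nlp
    return before_init

nlp = English()
# Point the [initialize.before_init] block at the registered callback
nlp.config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp.initialize()
assert nlp.meta["init_hook"] == "ran"
```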
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert empty tag "" to missing value None so that both misaligned
+        # tokens and tokens with missing annotation have the default missing
+        # value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
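The conversion above means misaligned tokens and tokens that were simply never annotated both become `None`, the default missing value of `SequenceCategoricalCrossentropy`, so neither contributes to the loss. A toy illustration with assumed data, not taken from the diff:

```python
# eg.get_aligned("TAG", as_string=True) yields "" for tokens without a gold tag
aligned_tags = ["", "V", "J", ""]
truths = [tag if tag != "" else None for tag in aligned_tags]
assert truths == [None, "V", "J", None]  # None positions produce no gradient
```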
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on

     class Config:
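The new fields are typed as optional callables that take and return the `nlp` object. A rough standalone illustration of what that validation enforces, using plain pydantic rather than spaCy's actual schema module:

```python
from typing import Any, Callable, Optional
from pydantic import BaseModel

class InitSketch(BaseModel):
    # Stand-ins for before_init/after_init: a callable or None
    before_init: Optional[Callable[[Any], Any]] = None
    after_init: Optional[Callable[[Any], Any]] = None

InitSketch(before_init=lambda nlp: nlp)  # accepted: value is callable
InitSketch(after_init=None)              # accepted: the hooks are optional
```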
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]

 PARTIAL_DATA = [
+    # partial annotation
     ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
 ]

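These fixtures exercise the missing-value handling from the `Tagger.get_loss` hunk: `""` marks tokens with no gold tag, and the second example tokenizes "hates" as "hate" + "s" so the alignment is imperfect. A minimal sketch of feeding such data to the tagger, assuming the standard v3 training APIs shown elsewhere in this diff:

```python
import spacy
from spacy.training import Example

PARTIAL_DATA = [("I like green eggs", {"tags": ["", "V", "J", ""]})]

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("V", "J"):
    tagger.add_label(tag)
examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in PARTIAL_DATA
]
optimizer = nlp.initialize(get_examples=lambda: examples)
losses = nlp.update(examples, sgd=optimizer)  # the "" tags contribute no loss
```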
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError


 from ..util import make_tempdir
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
     nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the [initialize]
+    shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False

     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():

         return after_pipeline_creation

+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])


 def test_language_from_config_before_after_init_invalid():
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
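The closure always writes the latest checkpoint and produces the best checkpoint by copying the directory instead of serializing the pipeline a second time. A standalone sketch of that copy pattern, with illustrative paths standing in for the directory constants:

```python
import shutil
from pathlib import Path

output_path = Path("output")
last = output_path / "model-last"   # illustrative names for DIR_MODEL_LAST
best = output_path / "model-best"   # and DIR_MODEL_BEST
last.mkdir(parents=True, exist_ok=True)  # stands in for nlp.to_disk(last)
if best.exists():
    shutil.rmtree(best)      # copytree requires that the target not exist
shutil.copytree(last, best)  # cheaper than serializing the pipeline again
```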
@@ -87,38 +100,29 @@ def train(
             if is_best_checkpoint is not None and output_path is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)
-                with nlp.use_params(optimizer.averages):
-                    nlp = before_to_disk(nlp)
-                    nlp.to_disk(output_path / DIR_MODEL_LAST)
-                if is_best_checkpoint:
-                    with nlp.use_params(optimizer.averages):
-                        nlp.to_disk(output_path / DIR_MODEL_BEST)
+                save_checkpoint(is_best_checkpoint)

     except Exception as e:
         if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error, but we do want to warn that we're trying
-            # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
+        save_checkpoint(False)
+    # This will only run if we didn't hit an error
     if optimizer.averages:
         nlp.use_params(optimizer.averages)
     if output_path is not None:
-        final_model_path = output_path / DIR_MODEL_LAST
-        nlp.to_disk(final_model_path)
-        # This will only run if we don't hit an error
         stdout.write(
-            msg.good("Saved pipeline to output directory", final_model_path) + "\n"
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
         )
-        return (nlp, final_model_path)
+        return (nlp, output_path / DIR_MODEL_LAST)
     else:
         return (nlp, None)
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc) as
 input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:

 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]
     return get_candidates
 ```

-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.
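For context, a sketch of the `create_instances` function whose tail appears in the hunk above. The signature comes from the hunk header; the body and registry name are an illustrative reconstruction, not the documented implementation:

```python
from typing import Callable, List, Tuple

import spacy
from spacy.tokens import Doc, Span

@spacy.registry.misc("instance_generator.v1")  # hypothetical registry name
def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                if ent1 != ent2 and abs(ent2.start - ent1.start) <= max_length:
                    # keep entity pairs within max_length tokens of each other
                    candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```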
@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:

 | Callback | Description |
-| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
+| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |

 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:

 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls

     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```

 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy

 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls

     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}

 During training, the results of each step are passed to a logger function. By
@@ -930,6 +930,55 @@ treebank.

 </Project>

+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}

 For most use cases, you **shouldn't** have to write your own training scripts