Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.
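
In config terms, the old `[nlp]` flag is replaced by a setting in the `[training]` block (a before/after sketch; the new setting defaults to `null`, as in the default config diff below):

```
# Before this change
[nlp]
load_vocab_data = true

# After this change
[training]
lookups = null
```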

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.
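
Because any `Lookups` object works, a pipeline can also register its own provider instead of going through `spacy-lookups-data`. A minimal sketch; the registry name `my_lookups.CustomLookups.v1` and the table contents are made up for illustration:

```
import spacy
from spacy.lookups import Lookups

@spacy.registry.misc("my_lookups.CustomLookups.v1")
def create_custom_lookups() -> Lookups:
    # Build a Lookups object by hand instead of loading it from spacy-lookups-data.
    lookups = Lookups()
    lookups.add_table("lexeme_norm", {"Gonna": "gonna", "Gotta": "gotta"})
    return lookups
```

The config would then point `[training.lookups]` at that function via `@misc = "my_lookups.CustomLookups.v1"`.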

Loading from `spacy-lookups-data` is now strict, so the config for each
language should specify the exact tables it requires. This also makes it
easier to control whether the larger `lexeme_cluster` and `lexeme_prob`
tables are included.
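
For a language that should also ship the larger tables, the config lists them explicitly (a sketch; which tables actually exist per language depends on what `spacy-lookups-data` provides, and strict loading will error on missing ones):

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
```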

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
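
The registered function above is a thin wrapper around `spacy.lookups.load_lookups`, so the equivalent direct call in Python is roughly (a sketch, assuming `spacy-lookups-data` is installed and provides an English `lexeme_norm` table):

```
from spacy.lookups import load_lookups

# Load only the lexeme_norm table for English; with strict loading,
# a missing table raises an error instead of being silently skipped.
lookups = load_lookups(lang="en", tables=["lexeme_norm"])
assert lookups.has_table("lexeme_norm")
```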

Author: Adriane Boyd, 2020-09-18 15:45:55 +02:00
Parent: 0406200a1e
Commit: eed4b785f5
7 changed files with 30 additions and 16 deletions

View File

@@ -88,6 +88,7 @@ def train(
sourced_components = get_sourced_components(config)
with show_validation_error(config_path):
nlp, config = util.load_model_from_config(config)
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)

View File

@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
lang = null
pipeline = []
disabled = []
load_vocab_data = true
before_creation = null
after_creation = null
after_pipeline_creation = null
@@ -58,6 +57,7 @@ accumulate_gradient = 1
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
lookups = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0

View File

@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
# This is the base config with all settings (training etc.)
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
@registry.misc("spacy.LoadLookupsData.v1")
def load_lookups_data(lang, tables):
lookups = load_lookups(lang=lang, tables=tables)
return lookups
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
@@ -152,7 +159,6 @@ class Language:
self.lang,
self.Defaults,
vectors_name=vectors_name,
load_data=self._config["nlp"]["load_vocab_data"],
)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):

View File

@@ -8,6 +8,7 @@ from collections import defaultdict
from thinc.api import Optimizer
from .attrs import NAMES
from .lookups import Lookups
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
class ConfigSchemaTraining(BaseModel):
# fmt: off
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use")
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")

View File

@@ -69,7 +69,6 @@ def test_util_dot_section():
[nlp]
lang = "en"
pipeline = ["textcat"]
load_vocab_data = false
[components]
@@ -95,15 +94,13 @@ def test_util_dot_section():
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert not en_config["nlp"]["load_vocab_data"]
assert nl_config["nlp"]["load_vocab_data"] # default value True
assert en_config["nlp"]["pipeline"] == ["textcat"]
assert nl_config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_config, "nlp.unknownattribute")
assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)

View File

@@ -253,6 +253,14 @@ def load_vectors_into_model(
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def load_vocab_data_into_model(
nlp: "Language", *, lookups: Optional["Lookups"]=None
) -> None:
"""Load vocab data."""
if lookups:
nlp.vocab.load_lookups(lookups)
def load_model(
name: Union[str, Path],
*,

View File

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None, load_data=True):
def create_vocab(lang, defaults, vectors_name=None):
# If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available
if load_data:
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
lookups = load_lookups(lang, tables=tables, strict=False)
else:
lookups = Lookups()
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
lookups.get_table("lexeme_norm", {}),
)
return Vocab(
lex_attr_getters=lex_attrs,
lookups=lookups,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
@@ -424,6 +417,14 @@ cdef class Vocab:
orth = self.strings.add(orth)
return orth in self.vectors
def load_lookups(self, lookups):
self.lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters[NORM],
lookups.get_table("lexeme_norm"),
)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.