Load vocab lookup tables at the beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.
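
For reference, the default config changes roughly like this (a minimal sketch based on the diff below):

```
# before: boolean flag under [nlp]
[nlp]
load_vocab_data = true

# after: optional lookups slot under [training]
[training]
lookups = null
```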

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.
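
For example, a fully custom `Lookups` object could be supplied through a registered function along these lines (the registry name `"my_lookups.v1"` and the table contents are made up for illustration):

```python
from spacy.lookups import Lookups
from spacy.util import registry


@registry.misc("my_lookups.v1")  # hypothetical name, not part of this change
def create_my_lookups() -> Lookups:
    lookups = Lookups()
    # Any tables can be added; "lexeme_norm" is one of the tables the vocab uses.
    lookups.add_table("lexeme_norm", {"gonna": "going to", "wanna": "want to"})
    return lookups
```

The config would then point `[training.lookups]` at `@misc = "my_lookups.v1"` instead of the built-in loader.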

The loading from `spacy-lookups-data` is now strict, so the config for each
language should specify the exact tables required. This also makes it
easier to control whether the larger `lexeme_cluster` and `lexeme_prob`
tables are included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
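
To pull in everything that was previously loaded by default, the table list can be extended to the full set that `create_vocab` used to request (see the removed lines in the diff below):

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
```

At training time this block is resolved to a `Lookups` object and `util.load_vocab_data_into_model` attaches it to `nlp.vocab` right after the pipeline is loaded from the config.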
Adriane Boyd 2020-09-18 15:45:55 +02:00
parent 0406200a1e
commit eed4b785f5
7 changed files with 30 additions and 16 deletions

```diff
@@ -88,6 +88,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
```

```diff
@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -58,6 +57,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
```

```diff
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups
 
 
 # This is the base config will all settings (training etc.)
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
+@registry.misc("spacy.LoadLookupsData.v1")
+def load_lookups_data(lang, tables):
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +159,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
```

```diff
@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer
 
 from .attrs import NAMES
+from .lookups import Lookups
 
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
```

```diff
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false
 
     [components]
@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
```

```diff
@@ -253,6 +253,14 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"]=None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.load_lookups(lookups)
+
+
 def load_model(
     name: Union[str, Path],
     *,
```

```diff
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,14 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors
 
+    def load_lookups(self, lookups):
+        self.lookups = lookups
+        if lookups.has_table("lexeme_norm"):
+            self.lex_attr_getters[NORM] = util.add_lookups(
+                self.lex_attr_getters[NORM],
+                lookups.get_table("lexeme_norm"),
+            )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
```