From eed4b785f51fcff2783e06306441f55437fc95fb Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 18 Sep 2020 15:45:55 +0200
Subject: [PATCH 1/2] Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created. The option moves from `nlp.load_vocab_data` to
`training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided. The loading from `spacy-lookups-data`
is now strict, so configs for each language should specify the exact
tables required. This also makes it easier to control whether the
larger clusters and probs tables are included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
---
 spacy/cli/train.py       |  1 +
 spacy/default_config.cfg |  2 +-
 spacy/language.py        |  8 +++++++-
 spacy/schemas.py         |  3 ++-
 spacy/tests/test_util.py |  7 ++-----
 spacy/util.py            |  8 ++++++++
 spacy/vocab.pyx          | 17 +++++++++--------
 7 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 50306b350..c6b39c289 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -88,6 +88,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c7c9593d7..1517421f0 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -58,6 +57,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
diff --git a/spacy/language.py b/spacy/language.py
index d530e6b92..1d0990c55 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LoadLookupsData.v1")
+def load_lookups_data(lang, tables):
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +159,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 06bc4beed..c72b5ca8b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 1f073ab32..8c931d31e 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
    pipeline = ["textcat"]
-    load_vocab_data = false

    [components]

@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
diff --git a/spacy/util.py b/spacy/util.py
index 18b34e4d6..2e285a128 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -253,6 +253,14 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"]=None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.load_lookups(lookups)
+
+
 def load_model(
     name: Union[str, Path],
     *,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ef0847e54..94289036a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,14 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    def load_lookups(self, lookups):
+        self.lookups = lookups
+        if lookups.has_table("lexeme_norm"):
+            self.lex_attr_getters[NORM] = util.add_lookups(
+                self.lex_attr_getters[NORM],
+                lookups.get_table("lexeme_norm"),
+            )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
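With this first patch applied, `train()` resolves `[training.lookups]` to a `Lookups` object and hands it to `util.load_vocab_data_into_model`, which installs the tables on the vocab via `Vocab.load_lookups`. A minimal sketch of that flow outside of training, using only names from the diff above and assuming `spacy-lookups-data` is installed with a `lexeme_norm` table for English:

```python
# Sketch of the load path added in this patch (assumes spacy-lookups-data
# is installed). create_vocab no longer pre-populates any lookups, so the
# tables have to be loaded and attached explicitly.
from spacy.lang.en import English
from spacy.lookups import load_lookups

nlp = English()
# Loading is now strict: a missing table raises instead of being skipped.
lookups = load_lookups(lang=nlp.lang, tables=["lexeme_norm"])
nlp.vocab.load_lookups(lookups)  # attaches tables, refreshes the NORM getter
```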
From 47080fba98bf7efd7432a0ac831d5715fad91a59 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 18 Sep 2020 19:43:19 +0200
Subject: [PATCH 2/2] Minor renaming / refactoring

* Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message
* Make `Vocab.lookups` a property
---
 spacy/language.py |  3 ++-
 spacy/util.py     |  2 +-
 spacy/vocab.pxd   |  2 +-
 spacy/vocab.pyx   | 19 ++++++++++++-------
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 1d0990c55..7d463731a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -87,8 +87,9 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


-@registry.misc("spacy.LoadLookupsData.v1")
+@registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups

diff --git a/spacy/util.py b/spacy/util.py
index 2e285a128..88162b23a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -258,7 +258,7 @@ def load_vocab_data_into_model(
 ) -> None:
     """Load vocab data."""
     if lookups:
-        nlp.vocab.load_lookups(lookups)
+        nlp.vocab.lookups = lookups


 def load_model(
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 69cec7d3d..7d8dfd5d6 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 94289036a..ce104d9db 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -417,13 +417,18 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

-    def load_lookups(self, lookups):
-        self.lookups = lookups
-        if lookups.has_table("lexeme_norm"):
-            self.lex_attr_getters[NORM] = util.add_lookups(
-                self.lex_attr_getters[NORM],
-                lookups.get_table("lexeme_norm"),
-            )
+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
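Note that after the rename in this second patch, configs must reference the registered function by its new name, so the example from the first commit message becomes:

```
[training.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```

Since `Vocab.lookups` is now a property, any `Lookups` object can also be assigned directly (`nlp.vocab.lookups = lookups`), and the setter takes care of refreshing the `NORM` lexeme attribute getter.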