diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index debecd0b1..bf3749c9e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -87,6 +87,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index f4a453f2a..5cd97a0eb 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -12,7 +12,6 @@ gpu_allocator = null
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -59,6 +58,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
diff --git a/spacy/language.py b/spacy/language.py
index d530e6b92..7d463731a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)
@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index db71af9ca..60655da8c 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -228,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 1f073ab32..8c931d31e 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]

@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)


diff --git a/spacy/util.py b/spacy/util.py
index 18b34e4d6..88162b23a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -253,6 +253,14 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 69cec7d3d..7d8dfd5d6 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ef0847e54..ce104d9db 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
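
Beyond the patch itself: with the boolean `load_vocab_data` switch removed from `[nlp]`, lookup tables are now injected through the `training.lookups` slot, which can reference the `spacy.LookupsDataLoader.v1` function registered in language.py above. A minimal sketch of such a config block, assuming spacy-lookups-data is installed; the table list here is an assumption, reusing the table names from the removed `create_vocab` branch:

[training.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]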
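
The other half of the change is the new `Vocab.lookups` property in vocab.pyx: assigning a `Lookups` object that carries a `lexeme_norm` table rewires the NORM lexeme attribute getter through `util.add_lookups`, so norms resolve against the table before falling back to the default getter. A minimal Python sketch exercising the setter, with a hypothetical table entry:

from spacy.lookups import Lookups
from spacy.vocab import Vocab

vocab = Vocab()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"whom": "who"})  # hypothetical norm entry
vocab.lookups = lookups  # __set__ installs the table into lex_attr_getters[NORM]
assert vocab["whom"].norm_ == "who"  # NORM now resolves via the lookup table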