From b397aedcdc79c70c5c38a625fe8082eabed5ef32 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 26 Sep 2020 21:10:21 +0200
Subject: [PATCH] Proof of concept for telling factories and tokenizer we're
 loading

---
 spacy/language.py            | 37 +++++++++++++++++++++++++++++-------
 spacy/pipeline/lemmatizer.py |  3 ++-
 spacy/util.py                |  6 +++++-
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a52391419..3bc930b22 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,7 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
-from .lookups import load_lookups
+from .lookups import load_lookups, Lookups
 
 
 # This is the base config will all settings (training etc.)
@@ -68,9 +68,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     """
 
     def tokenizer_factory(nlp: "Language") -> Tokenizer:
-        prefixes = nlp.Defaults.prefixes
-        suffixes = nlp.Defaults.suffixes
-        infixes = nlp.Defaults.infixes
+        if nlp._context != "loading":
+            prefixes = nlp.Defaults.prefixes
+            suffixes = nlp.Defaults.suffixes
+            infixes = nlp.Defaults.infixes
+        else:
+            prefixes = None
+            suffixes = None
+            infixes = None
         prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
         suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
         infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
@@ -86,11 +91,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
 
     return tokenizer_factory
 
-
+_CONTEXT = ""
 @registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
     util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
-    lookups = load_lookups(lang=lang, tables=tables)
+    if _CONTEXT != "loading":
+        lookups = load_lookups(lang=lang, tables=tables)
+    else:
+        lookups = Lookups()
     return lookups
 
 
@@ -111,6 +119,7 @@ class Language:
 
     factories = SimpleFrozenDict(error=Errors.E957)
     _factory_meta: Dict[str, "FactoryMeta"] = {}  # meta by factory
+    _context: str
 
     def __init__(
         self,
@@ -119,6 +128,7 @@ class Language:
         max_length: int = 10 ** 6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        _context: str = "",
         **kwargs,
     ) -> None:
         """Initialise a Language object.
@@ -139,6 +149,8 @@ class Language:
 
         DOCS: https://nightly.spacy.io/api/language#init
         """
+        global _CONTEXT
+        _CONTEXT = _context
         # We're only calling this to import all factories provided via entry
         # points. The factory decorator applied to these functions takes care
         # of the rest.
@@ -148,6 +160,7 @@ class Language:
         self._meta = dict(meta)
         self._path = None
         self._optimizer = None
+        self._context = _context
         # Component meta and configs are only needed on the instance
         self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
         self._pipe_configs: Dict[str, Config] = {}  # config by component
@@ -1474,6 +1487,7 @@ class Language:
         meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
+        _context: str = ""
     ) -> "Language":
         """Create the nlp object from a loaded config. Will set up the tokenizer
         and language data, add pipeline components etc. If no config is provided,
@@ -1540,7 +1554,12 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
+        nlp = lang_cls(
+            vocab=vocab,
+            create_tokenizer=create_tokenizer,
+            meta=meta,
+            _context=_context
+        )
         if after_creation is not None:
             nlp = after_creation(nlp)
             if not isinstance(nlp, cls):
@@ -1637,6 +1656,7 @@ class Language:
 
         DOCS: https://nightly.spacy.io/api/language#from_disk
         """
+        global _CONTEXT
 
         def deserialize_meta(path: Path) -> None:
             if path.exists():
@@ -1675,6 +1695,8 @@ class Language:
         util.from_disk(path, deserializers, exclude)
         self._path = path
         self._link_components()
+        self._context = ""
+        _CONTEXT = ""
         return self
 
     def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
@@ -1736,6 +1758,7 @@ class Language:
         )
         util.from_bytes(bytes_data, deserializers, exclude)
         self._link_components()
+        self._context = ""
         return self
 
 
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index c30d09f62..15eda0204 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -31,7 +31,8 @@ def make_lemmatizer(
     lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    if nlp._context != "loading":
+        lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
     return Lemmatizer(
         nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
     )
diff --git a/spacy/util.py b/spacy/util.py
index 378ec2823..3dfc2e9a4 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -345,14 +345,16 @@ def load_model_from_path(
     keyed by section values in dot notation.
     RETURNS (Language): The loaded nlp object.
     """
+    from . import language
     if not model_path.exists():
         raise IOError(Errors.E052.format(path=model_path))
     if not meta:
         meta = get_model_meta(model_path)
     config_path = model_path / "config.cfg"
     config = load_config(config_path, overrides=dict_to_dot(config))
+    language._CONTEXT = "loading"
     nlp, _ = load_model_from_config(
-        config, vocab=vocab, disable=disable, exclude=exclude
+        config, vocab=vocab, disable=disable, exclude=exclude, _context="loading"
     )
     return nlp.from_disk(model_path, exclude=exclude)
 
@@ -365,6 +367,7 @@ def load_model_from_config(
     exclude: Iterable[str] = SimpleFrozenList(),
     auto_fill: bool = False,
     validate: bool = True,
+    _context: str = ""
 ) -> Tuple["Language", Config]:
     """Create an nlp object from a config. Expects the full config file including
     a section "nlp" containing the settings for the nlp object.
@@ -397,6 +400,7 @@ def load_model_from_config(
         exclude=exclude,
         auto_fill=auto_fill,
         validate=validate,
+        _context=_context
     )
     return nlp, nlp.resolved
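
A note on the mechanism, with a minimal standalone sketch (plain Python, no
spaCy required). The module-level flag below mirrors the _CONTEXT global this
patch adds to spacy/language.py; the names load_tables, make_component, and
load_pipeline are hypothetical stand-ins for load_lookups(), a registered
factory, and util.load_model_from_path(). The idea: while a serialized
pipeline is being deserialized, factories skip building default data, because
from_disk() is about to overwrite it anyway.

    # A module-level flag plays the role of language._CONTEXT in the patch:
    # it tells factories that a saved pipeline is being deserialized, so any
    # default data they would normally build is about to be overwritten by
    # from_disk() and can be skipped.
    _CONTEXT = ""

    def load_tables() -> dict:
        # Hypothetical stand-in for the expensive default-data load done by
        # load_lookups() / Lemmatizer.load_lookups().
        print("loading default tables (expensive)")
        return {"lemma_lookup": {"was": "be"}}

    def make_component() -> dict:
        # Hypothetical stand-in for a registered factory: only do the
        # expensive load when we are NOT in the "loading" context.
        if _CONTEXT != "loading":
            return {"tables": load_tables()}
        return {"tables": {}}  # empty; the real data arrives via from_disk()

    def load_pipeline(saved_tables: dict) -> dict:
        # Hypothetical stand-in for util.load_model_from_path(): set the
        # flag, build the component cheaply, overwrite its data with the
        # serialized tables, then clear the flag as Language.from_disk() does.
        global _CONTEXT
        _CONTEXT = "loading"
        try:
            component = make_component()        # skips load_tables()
            component["tables"] = saved_tables  # plays the role of from_disk()
        finally:
            _CONTEXT = ""
        return component

    print(load_pipeline({"lemma_lookup": {"ran": "run"}}))
    # -> {'tables': {'lemma_lookup': {'ran': 'run'}}}, and no "expensive" print

The patch threads the same signal through two channels: the _CONTEXT module
global, for registered functions like load_lookups_data that never see the
nlp object, and the nlp._context attribute, for factories like make_lemmatizer
and the tokenizer factory that do. Both are reset to "" once from_disk() or
from_bytes() completes.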