Proof of concept for telling factories and tokenizer we're loading

This commit is contained in:
Matthew Honnibal 2020-09-26 21:10:21 +02:00
parent 702edf52a0
commit b397aedcdc
3 changed files with 37 additions and 9 deletions

View File

@ -31,7 +31,7 @@ from .schemas import ConfigSchema
from .git_info import GIT_VERSION from .git_info import GIT_VERSION
from . import util from . import util
from . import about from . import about
from .lookups import load_lookups from .lookups import load_lookups, Lookups
# This is the base config with all settings (training etc.) # This is the base config with all settings (training etc.)
@ -68,9 +68,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
""" """
def tokenizer_factory(nlp: "Language") -> Tokenizer: def tokenizer_factory(nlp: "Language") -> Tokenizer:
prefixes = nlp.Defaults.prefixes if nlp._context != "loading":
suffixes = nlp.Defaults.suffixes prefixes = nlp.Defaults.prefixes
infixes = nlp.Defaults.infixes suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
else:
prefixes = None
suffixes = None
infixes = None
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
@ -86,11 +91,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory return tokenizer_factory
_CONTEXT = ""
@registry.misc("spacy.LookupsDataLoader.v1") @registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables): def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables) if _CONTEXT != "loading":
lookups = load_lookups(lang=lang, tables=tables)
else:
lookups = Lookups()
return lookups return lookups
@ -111,6 +119,7 @@ class Language:
factories = SimpleFrozenDict(error=Errors.E957) factories = SimpleFrozenDict(error=Errors.E957)
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory _factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
_context: str
def __init__( def __init__(
self, self,
@ -119,6 +128,7 @@ class Language:
max_length: int = 10 ** 6, max_length: int = 10 ** 6,
meta: Dict[str, Any] = {}, meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
_context: str = "",
**kwargs, **kwargs,
) -> None: ) -> None:
"""Initialise a Language object. """Initialise a Language object.
@ -139,6 +149,8 @@ class Language:
DOCS: https://nightly.spacy.io/api/language#init DOCS: https://nightly.spacy.io/api/language#init
""" """
global _CONTEXT
_CONTEXT = _context
# We're only calling this to import all factories provided via entry # We're only calling this to import all factories provided via entry
# points. The factory decorator applied to these functions takes care # points. The factory decorator applied to these functions takes care
# of the rest. # of the rest.
@ -148,6 +160,7 @@ class Language:
self._meta = dict(meta) self._meta = dict(meta)
self._path = None self._path = None
self._optimizer = None self._optimizer = None
self._context = _context
# Component meta and configs are only needed on the instance # Component meta and configs are only needed on the instance
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
self._pipe_configs: Dict[str, Config] = {} # config by component self._pipe_configs: Dict[str, Config] = {} # config by component
@ -1474,6 +1487,7 @@ class Language:
meta: Dict[str, Any] = SimpleFrozenDict(), meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True, auto_fill: bool = True,
validate: bool = True, validate: bool = True,
_context: str = ""
) -> "Language": ) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer """Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided, and language data, add pipeline components etc. If no config is provided,
@ -1540,7 +1554,12 @@ class Language:
# inside stuff like the spacy train function. If we loaded them here, # inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config, # then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk. # and then again when we load from disk.
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta) nlp = lang_cls(
vocab=vocab,
create_tokenizer=create_tokenizer,
meta=meta,
_context=_context
)
if after_creation is not None: if after_creation is not None:
nlp = after_creation(nlp) nlp = after_creation(nlp)
if not isinstance(nlp, cls): if not isinstance(nlp, cls):
@ -1637,6 +1656,7 @@ class Language:
DOCS: https://nightly.spacy.io/api/language#from_disk DOCS: https://nightly.spacy.io/api/language#from_disk
""" """
global _CONTEXT
def deserialize_meta(path: Path) -> None: def deserialize_meta(path: Path) -> None:
if path.exists(): if path.exists():
@ -1675,6 +1695,8 @@ class Language:
util.from_disk(path, deserializers, exclude) util.from_disk(path, deserializers, exclude)
self._path = path self._path = path
self._link_components() self._link_components()
self._context = ""
_CONTEXT = ""
return self return self
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
@ -1736,6 +1758,7 @@ class Language:
) )
util.from_bytes(bytes_data, deserializers, exclude) util.from_bytes(bytes_data, deserializers, exclude)
self._link_components() self._link_components()
self._context = ""
return self return self

View File

@ -31,7 +31,8 @@ def make_lemmatizer(
lookups: Optional[Lookups], lookups: Optional[Lookups],
overwrite: bool = False, overwrite: bool = False,
): ):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) if nlp._context != "loading":
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer( return Lemmatizer(
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
) )

View File

@ -345,14 +345,16 @@ def load_model_from_path(
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
from . import language
if not model_path.exists(): if not model_path.exists():
raise IOError(Errors.E052.format(path=model_path)) raise IOError(Errors.E052.format(path=model_path))
if not meta: if not meta:
meta = get_model_meta(model_path) meta = get_model_meta(model_path)
config_path = model_path / "config.cfg" config_path = model_path / "config.cfg"
config = load_config(config_path, overrides=dict_to_dot(config)) config = load_config(config_path, overrides=dict_to_dot(config))
language._CONTEXT = "loading"
nlp, _ = load_model_from_config( nlp, _ = load_model_from_config(
config, vocab=vocab, disable=disable, exclude=exclude config, vocab=vocab, disable=disable, exclude=exclude, _context="loading"
) )
return nlp.from_disk(model_path, exclude=exclude) return nlp.from_disk(model_path, exclude=exclude)
@ -365,6 +367,7 @@ def load_model_from_config(
exclude: Iterable[str] = SimpleFrozenList(), exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False, auto_fill: bool = False,
validate: bool = True, validate: bool = True,
_context: str = ""
) -> Tuple["Language", Config]: ) -> Tuple["Language", Config]:
"""Create an nlp object from a config. Expects the full config file including """Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object. a section "nlp" containing the settings for the nlp object.
@ -397,6 +400,7 @@ def load_model_from_config(
exclude=exclude, exclude=exclude,
auto_fill=auto_fill, auto_fill=auto_fill,
validate=validate, validate=validate,
_context=_context
) )
return nlp, nlp.resolved return nlp, nlp.resolved