mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-02 18:06:46 +03:00
Proof of concept for telling factories and tokenizer we're loading
This commit is contained in:
parent
702edf52a0
commit
b397aedcdc
|
@ -31,7 +31,7 @@ from .schemas import ConfigSchema
|
||||||
from .git_info import GIT_VERSION
|
from .git_info import GIT_VERSION
|
||||||
from . import util
|
from . import util
|
||||||
from . import about
|
from . import about
|
||||||
from .lookups import load_lookups
|
from .lookups import load_lookups, Lookups
|
||||||
|
|
||||||
|
|
||||||
# This is the base config with all settings (training etc.)
|
# This is the base config with all settings (training etc.)
|
||||||
|
@ -68,9 +68,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def tokenizer_factory(nlp: "Language") -> Tokenizer:
|
def tokenizer_factory(nlp: "Language") -> Tokenizer:
|
||||||
|
if nlp._context != "loading":
|
||||||
prefixes = nlp.Defaults.prefixes
|
prefixes = nlp.Defaults.prefixes
|
||||||
suffixes = nlp.Defaults.suffixes
|
suffixes = nlp.Defaults.suffixes
|
||||||
infixes = nlp.Defaults.infixes
|
infixes = nlp.Defaults.infixes
|
||||||
|
else:
|
||||||
|
prefixes = None
|
||||||
|
suffixes = None
|
||||||
|
infixes = None
|
||||||
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
||||||
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
||||||
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
||||||
|
@ -86,11 +91,14 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||||
|
|
||||||
return tokenizer_factory
|
return tokenizer_factory
|
||||||
|
|
||||||
|
_CONTEXT = ""
|
||||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||||
def load_lookups_data(lang, tables):
|
def load_lookups_data(lang, tables):
|
||||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||||
|
if _CONTEXT != "loading":
|
||||||
lookups = load_lookups(lang=lang, tables=tables)
|
lookups = load_lookups(lang=lang, tables=tables)
|
||||||
|
else:
|
||||||
|
lookups = Lookups()
|
||||||
return lookups
|
return lookups
|
||||||
|
|
||||||
|
|
||||||
|
@ -111,6 +119,7 @@ class Language:
|
||||||
|
|
||||||
factories = SimpleFrozenDict(error=Errors.E957)
|
factories = SimpleFrozenDict(error=Errors.E957)
|
||||||
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
|
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
|
||||||
|
_context: str
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -119,6 +128,7 @@ class Language:
|
||||||
max_length: int = 10 ** 6,
|
max_length: int = 10 ** 6,
|
||||||
meta: Dict[str, Any] = {},
|
meta: Dict[str, Any] = {},
|
||||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||||
|
_context: str = "",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialise a Language object.
|
"""Initialise a Language object.
|
||||||
|
@ -139,6 +149,8 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#init
|
DOCS: https://nightly.spacy.io/api/language#init
|
||||||
"""
|
"""
|
||||||
|
global _CONTEXT
|
||||||
|
_CONTEXT = _context
|
||||||
# We're only calling this to import all factories provided via entry
|
# We're only calling this to import all factories provided via entry
|
||||||
# points. The factory decorator applied to these functions takes care
|
# points. The factory decorator applied to these functions takes care
|
||||||
# of the rest.
|
# of the rest.
|
||||||
|
@ -148,6 +160,7 @@ class Language:
|
||||||
self._meta = dict(meta)
|
self._meta = dict(meta)
|
||||||
self._path = None
|
self._path = None
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
self._context = _context
|
||||||
# Component meta and configs are only needed on the instance
|
# Component meta and configs are only needed on the instance
|
||||||
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
|
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
|
||||||
self._pipe_configs: Dict[str, Config] = {} # config by component
|
self._pipe_configs: Dict[str, Config] = {} # config by component
|
||||||
|
@ -1474,6 +1487,7 @@ class Language:
|
||||||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
auto_fill: bool = True,
|
auto_fill: bool = True,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
|
_context: str = ""
|
||||||
) -> "Language":
|
) -> "Language":
|
||||||
"""Create the nlp object from a loaded config. Will set up the tokenizer
|
"""Create the nlp object from a loaded config. Will set up the tokenizer
|
||||||
and language data, add pipeline components etc. If no config is provided,
|
and language data, add pipeline components etc. If no config is provided,
|
||||||
|
@ -1540,7 +1554,12 @@ class Language:
|
||||||
# inside stuff like the spacy train function. If we loaded them here,
|
# inside stuff like the spacy train function. If we loaded them here,
|
||||||
# then we would load them twice at runtime: once when we make from config,
|
# then we would load them twice at runtime: once when we make from config,
|
||||||
# and then again when we load from disk.
|
# and then again when we load from disk.
|
||||||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
|
nlp = lang_cls(
|
||||||
|
vocab=vocab,
|
||||||
|
create_tokenizer=create_tokenizer,
|
||||||
|
meta=meta,
|
||||||
|
_context=_context
|
||||||
|
)
|
||||||
if after_creation is not None:
|
if after_creation is not None:
|
||||||
nlp = after_creation(nlp)
|
nlp = after_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
|
@ -1637,6 +1656,7 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/language#from_disk
|
DOCS: https://nightly.spacy.io/api/language#from_disk
|
||||||
"""
|
"""
|
||||||
|
global _CONTEXT
|
||||||
|
|
||||||
def deserialize_meta(path: Path) -> None:
|
def deserialize_meta(path: Path) -> None:
|
||||||
if path.exists():
|
if path.exists():
|
||||||
|
@ -1675,6 +1695,8 @@ class Language:
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
self._path = path
|
self._path = path
|
||||||
self._link_components()
|
self._link_components()
|
||||||
|
self._context = ""
|
||||||
|
_CONTEXT = ""
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||||
|
@ -1736,6 +1758,7 @@ class Language:
|
||||||
)
|
)
|
||||||
util.from_bytes(bytes_data, deserializers, exclude)
|
util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
self._link_components()
|
self._link_components()
|
||||||
|
self._context = ""
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ def make_lemmatizer(
|
||||||
lookups: Optional[Lookups],
|
lookups: Optional[Lookups],
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
):
|
):
|
||||||
|
if nlp._context != "loading":
|
||||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||||
return Lemmatizer(
|
return Lemmatizer(
|
||||||
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
|
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
|
||||||
|
|
|
@ -345,14 +345,16 @@ def load_model_from_path(
|
||||||
keyed by section values in dot notation.
|
keyed by section values in dot notation.
|
||||||
RETURNS (Language): The loaded nlp object.
|
RETURNS (Language): The loaded nlp object.
|
||||||
"""
|
"""
|
||||||
|
from . import language
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
raise IOError(Errors.E052.format(path=model_path))
|
raise IOError(Errors.E052.format(path=model_path))
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
config_path = model_path / "config.cfg"
|
config_path = model_path / "config.cfg"
|
||||||
config = load_config(config_path, overrides=dict_to_dot(config))
|
config = load_config(config_path, overrides=dict_to_dot(config))
|
||||||
|
language._CONTEXT = "loading"
|
||||||
nlp, _ = load_model_from_config(
|
nlp, _ = load_model_from_config(
|
||||||
config, vocab=vocab, disable=disable, exclude=exclude
|
config, vocab=vocab, disable=disable, exclude=exclude, _context="loading"
|
||||||
)
|
)
|
||||||
return nlp.from_disk(model_path, exclude=exclude)
|
return nlp.from_disk(model_path, exclude=exclude)
|
||||||
|
|
||||||
|
@ -365,6 +367,7 @@ def load_model_from_config(
|
||||||
exclude: Iterable[str] = SimpleFrozenList(),
|
exclude: Iterable[str] = SimpleFrozenList(),
|
||||||
auto_fill: bool = False,
|
auto_fill: bool = False,
|
||||||
validate: bool = True,
|
validate: bool = True,
|
||||||
|
_context: str = ""
|
||||||
) -> Tuple["Language", Config]:
|
) -> Tuple["Language", Config]:
|
||||||
"""Create an nlp object from a config. Expects the full config file including
|
"""Create an nlp object from a config. Expects the full config file including
|
||||||
a section "nlp" containing the settings for the nlp object.
|
a section "nlp" containing the settings for the nlp object.
|
||||||
|
@ -397,6 +400,7 @@ def load_model_from_config(
|
||||||
exclude=exclude,
|
exclude=exclude,
|
||||||
auto_fill=auto_fill,
|
auto_fill=auto_fill,
|
||||||
validate=validate,
|
validate=validate,
|
||||||
|
_context=_context
|
||||||
)
|
)
|
||||||
return nlp, nlp.resolved
|
return nlp, nlp.resolved
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user