Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00
Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded at the start of training rather than when the vocab is initialized, since the vocab doesn't have access to the full config when it's created. The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any `Lookups` object can be provided. The loading from `spacy-lookups-data` is now strict, so configs for each language should specify the exact tables required. This also makes it easier to control whether the larger clusters and probs tables are included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
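Since any `Lookups` object can be provided, a project can also register its own table source instead of `spacy.LoadLookupsData.v1`. A minimal sketch, assuming the `registry.misc` and `Lookups` APIs used in this commit; the registered name, function, and table contents are illustrative only:

```python
from spacy.lookups import Lookups
from spacy.util import registry


@registry.misc("my_project.CustomLookups.v1")  # hypothetical registered name
def load_custom_lookups(lang: str) -> Lookups:
    # Build the tables in code instead of pulling them from spacy-lookups-data.
    lookups = Lookups()
    lookups.add_table("lexeme_norm", {"gonna": "going to"})
    return lookups


# The training config would then point training.lookups at the custom function:
#
#   [training.lookups]
#   @misc = "my_project.CustomLookups.v1"
#   lang = ${nlp.lang}
```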
This commit is contained in:
parent 0406200a1e
commit eed4b785f5
```
@@ -88,6 +88,7 @@ def train(
    sourced_components = get_sourced_components(config)
    with show_validation_error(config_path):
        nlp, config = util.load_model_from_config(config)
    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
    if config["training"]["vectors"] is not None:
        util.load_vectors_into_model(nlp, config["training"]["vectors"])
    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
```
```
@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
lang = null
pipeline = []
disabled = []
load_vocab_data = true
before_creation = null
after_creation = null
after_pipeline_creation = null
```

```
@@ -58,6 +57,7 @@ accumulate_gradient = 1
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
lookups = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
```
```
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups


# This is the base config will all settings (training etc.)
```

```
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
    return tokenizer_factory


@registry.misc("spacy.LoadLookupsData.v1")
def load_lookups_data(lang, tables):
    lookups = load_lookups(lang=lang, tables=tables)
    return lookups


class Language:
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.
```

```
@@ -152,7 +159,6 @@ class Language:
                self.lang,
                self.Defaults,
                vectors_name=vectors_name,
                load_data=self._config["nlp"]["load_vocab_data"],
            )
        else:
            if (self.lang and vocab.lang) and (self.lang != vocab.lang):
```
```
@@ -8,6 +8,7 @@ from collections import defaultdict
from thinc.api import Optimizer

from .attrs import NAMES
from .lookups import Lookups

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
```

```
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
class ConfigSchemaTraining(BaseModel):
    # fmt: off
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
    batcher: Batcher = Field(..., title="Batcher for the training data")
```

```
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
    pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
    disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
    tokenizer: Callable = Field(..., title="The tokenizer to use")
    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
    after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
```
```
@@ -69,7 +69,6 @@ def test_util_dot_section():
    [nlp]
    lang = "en"
    pipeline = ["textcat"]
    load_vocab_data = false

    [components]
```

```
@@ -95,15 +94,13 @@ def test_util_dot_section():
    # not exclusive_classes
    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
    # Test that default values got overwritten
    assert not en_config["nlp"]["load_vocab_data"]
    assert nl_config["nlp"]["load_vocab_data"]  # default value True
    assert en_config["nlp"]["pipeline"] == ["textcat"]
    assert nl_config["nlp"]["pipeline"] == []  # default value []
    # Test proper functioning of 'dot_to_object'
    with pytest.raises(KeyError):
        dot_to_object(en_config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_config, "nlp.unknownattribute")
    assert not dot_to_object(en_config, "nlp.load_vocab_data")
    assert dot_to_object(nl_config, "nlp.load_vocab_data")
    assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
```
```
@@ -253,6 +253,14 @@ def load_vectors_into_model(
        nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


def load_vocab_data_into_model(
    nlp: "Language", *, lookups: Optional["Lookups"]=None
) -> None:
    """Load vocab data."""
    if lookups:
        nlp.vocab.load_lookups(lookups)


def load_model(
    name: Union[str, Path],
    *,
```
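For reference, a sketch of how the new `load_vocab_data_into_model` helper could be called outside the `train` CLI, assuming `spacy-lookups-data` is installed and using the `load_lookups` loader referenced in this commit; the language and table choice are illustrative:

```python
from spacy.lang.en import English
from spacy.lookups import load_lookups
from spacy.util import load_vocab_data_into_model

nlp = English()
# Loading is strict: this raises if the requested table isn't available for "en".
lookups = load_lookups("en", tables=["lexeme_norm"])
load_vocab_data_into_model(nlp, lookups=lookups)
```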
```
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


def create_vocab(lang, defaults, vectors_name=None, load_data=True):
def create_vocab(lang, defaults, vectors_name=None):
    # If the spacy-lookups-data package is installed, we pre-populate the lookups
    # with lexeme data, if available
    if load_data:
        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
        lookups = load_lookups(lang, tables=tables, strict=False)
    else:
        lookups = Lookups()
    lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
    # This is messy, but it's the minimal working fix to Issue #639.
    lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
```

```
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
    lex_attrs[NORM] = util.add_lookups(
        lex_attrs.get(NORM, LEX_ATTRS[NORM]),
        BASE_NORMS,
        lookups.get_table("lexeme_norm", {}),
    )
    return Vocab(
        lex_attr_getters=lex_attrs,
        lookups=lookups,
        writing_system=defaults.writing_system,
        get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
        vectors_name=vectors_name,
```

```
@@ -424,6 +417,14 @@ cdef class Vocab:
            orth = self.strings.add(orth)
        return orth in self.vectors

    def load_lookups(self, lookups):
        self.lookups = lookups
        if lookups.has_table("lexeme_norm"):
            self.lex_attr_getters[NORM] = util.add_lookups(
                self.lex_attr_getters[NORM],
                lookups.get_table("lexeme_norm"),
            )

    def to_disk(self, path, *, exclude=tuple()):
        """Save the current state to a directory.
```
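And a small sketch of the effect of the new `Vocab.load_lookups` method, assuming the behaviour added above; the `lexeme_norm` entry is made up for illustration:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})
nlp.vocab.load_lookups(lookups)
# The NORM lexeme attribute getter now consults the lexeme_norm table first.
print(nlp.vocab["gonna"].norm_)  # expected: "going to"
```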