Merge pull request #6092 from adrianeboyd/bugfix/load-vocab-lookups-2

Ines Montani 2020-09-19 12:33:38 +02:00 committed by GitHub
commit e863b3dc14
8 changed files with 37 additions and 17 deletions

View File

@@ -87,6 +87,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)

View File

@@ -12,7 +12,6 @@ gpu_allocator = null
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -59,6 +58,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
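Since lookups = null now leaves the tables empty by default, a config that wants the old behavior can point the new setting at the registered loader added below in language.py. A minimal sketch, assuming the spacy-lookups-data package is installed (the table list here is illustrative):

[training.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]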

View File

@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups

 # This is the base config will all settings (training etc.)
@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
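The @registry.misc entry makes the loader resolvable by name from any config. A quick sketch of calling it directly, assuming a spaCy v3 install with spacy-lookups-data available:

from spacy.util import registry

# Fetch the function registered as "spacy.LookupsDataLoader.v1" and run it
loader = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = loader(lang="en", tables=["lexeme_norm"])
assert lookups.has_table("lexeme_norm")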

View File

@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer
 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -228,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
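With the new field, training.lookups validates as either None or a Lookups instance, so tables can also be built in memory rather than loaded from a package. A small sketch (table name and contents are illustrative):

from spacy.lookups import Lookups

# Build an in-memory lookups object that satisfies the new schema field
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})
assert lookups.has_table("lexeme_norm")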

View File

@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]
@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
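For reference, dot_to_object resolves a dotted path against a nested config dict and raises KeyError for unknown keys; a minimal sketch:

from spacy.util import dot_to_object

config = {"nlp": {"lang": "en", "pipeline": ["textcat"]}}
assert dot_to_object(config, "nlp.pipeline") == ["textcat"]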

View File

@@ -253,6 +253,14 @@ def load_vectors_into_model(
             nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,
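The new helper simply assigns the lookups to the vocab, which (via the property added in vocab.pyx below) also refreshes the NORM getter. A usage sketch, with a blank pipeline and illustrative table data:

import spacy
from spacy.lookups import Lookups
from spacy.util import load_vocab_data_into_model

nlp = spacy.blank("en")
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})
# Passing lookups=None would leave the vocab untouched
load_vocab_data_into_model(nlp, lookups=lookups)
assert nlp.vocab.lookups.has_table("lexeme_norm")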

View File

@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length

View File

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.