Merge pull request #6092 from adrianeboyd/bugfix/load-vocab-lookups-2

Commit e863b3dc14 by Ines Montani, 2020-09-19 12:33:38 +02:00 (committed via GitHub)
8 changed files with 37 additions and 17 deletions

View File

@@ -87,6 +87,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
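Note: the new call installs any user-supplied lookups right after the pipeline is created and before vectors are loaded, so the vocab is fully populated when components initialize. A rough equivalent outside the CLI (the resolved training dict below is a hypothetical stand-in for config["training"]):

    import spacy
    from spacy import util

    # Hypothetical stand-in for the resolved config["training"] section;
    # "lookups" is either None or a spacy.lookups.Lookups instance.
    training = {"lookups": None, "vectors": None}

    nlp = spacy.blank("en")
    util.load_vocab_data_into_model(nlp, lookups=training["lookups"])  # no-op for None
    if training["vectors"] is not None:
        util.load_vectors_into_model(nlp, training["vectors"])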

View File

@@ -12,7 +12,6 @@ gpu_allocator = null
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -59,6 +58,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
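With the default at null, no lookup tables are loaded implicitly anymore. To opt back in, the new slot can point at the loader registered below in language.py; a sketch of the config override (the table list is illustrative):

    [training.lookups]
    @misc = "spacy.LookupsDataLoader.v1"
    lang = ${nlp.lang}
    tables = ["lexeme_norm"]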

View File

@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups

 # This is the base config will all settings (training etc.)
@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
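The @registry.misc decorator is what makes the loader addressable by name from a config. Resolving it by hand looks roughly like this (requires the optional spacy-lookups-data package; the table list is illustrative):

    from spacy.util import registry

    loader = registry.misc.get("spacy.LookupsDataLoader.v1")
    lookups = loader(lang="en", tables=["lexeme_norm"])
    assert lookups.has_table("lexeme_norm")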

View File

@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer
 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -228,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
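Since Lookups is not a pydantic type, the new field presumably relies on the schema's arbitrary-types support. A hedged, self-contained mirror of the pattern (not spaCy's actual schema class):

    from typing import Optional
    from pydantic import BaseModel, Field
    from spacy.lookups import Lookups

    class TrainingLookupsDemo(BaseModel):
        lookups: Optional[Lookups] = Field(None, title="Vocab lookups")

        class Config:
            arbitrary_types_allowed = True

    TrainingLookupsDemo(lookups=None)       # valid: lookups disabled
    TrainingLookupsDemo(lookups=Lookups())  # valid: a real Lookups object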

View File

@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]
@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
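The dropped assertions targeted the removed nlp.load_vocab_data setting; dot_to_object itself is unchanged. Its semantics, as exercised above (values illustrative):

    from spacy.util import dot_to_object

    cfg = {"nlp": {"pipeline": ["textcat"]}}
    assert dot_to_object(cfg, "nlp.pipeline") == ["textcat"]
    # Missing keys raise KeyError instead of returning a default.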

View File

@@ -253,6 +253,14 @@ def load_vectors_into_model(
         nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,
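Minimal usage sketch for the new helper (the helper and its signature come from this diff; the table contents are illustrative):

    import spacy
    from spacy.lookups import Lookups
    from spacy.util import load_vocab_data_into_model

    nlp = spacy.blank("en")
    lookups = Lookups()
    lookups.add_table("lexeme_norm", {"wanna": "want to"})
    load_vocab_data_into_model(nlp, lookups=lookups)
    assert nlp.vocab.lookups.has_table("lexeme_norm")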

View File

@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length

View File

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,19 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.