Mirror of https://github.com/explosion/spaCy.git
Merge pull request #6092 from adrianeboyd/bugfix/load-vocab-lookups-2
Commit e863b3dc14
@@ -87,6 +87,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)

@@ -12,7 +12,6 @@ gpu_allocator = null
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null

@@ -59,6 +58,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0

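As context for the new setting, a minimal sketch (not part of the diff) of where the `training.lookups` slot shows up after this change; it stays `None` (the config's `null`) unless overridden, for example with the registered loader added later in this commit:

import spacy

# Hypothetical check of the new default: a blank pipeline's resolved config
# carries training.lookups, which remains None unless the user overrides it.
nlp = spacy.blank("en")
print(nlp.config["training"]["lookups"])  # expected: None
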
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups


 # This is the base config will all settings (training etc.)

@@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory


+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.

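A rough usage sketch (not part of the diff) of resolving the loader registered above by name and calling it directly; it assumes spacy-lookups-data is installed and provides an English "lexeme_norm" table. In normal use the config system resolves it via [training.lookups]:

from spacy.util import registry

# Look up the registered function and call it outside the config system.
loader = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = loader(lang="en", tables=["lexeme_norm"])
print(lookups.tables)  # expected to include "lexeme_norm"
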
@@ -152,7 +160,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):

@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer

 from .attrs import NAMES
+from .lookups import Lookups

 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports

@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")

@@ -228,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")

@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false

     [components]

@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)

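As a side note on the helper exercised by this test, a minimal sketch of dot_to_object behaviour (the dictionary here is invented for illustration):

from spacy.util import dot_to_object

# dot_to_object walks a nested config/dict along a dotted path and raises
# KeyError when a segment is missing, which is what the test relies on.
config = {"nlp": {"lang": "en", "pipeline": ["textcat"]}}
assert dot_to_object(config, "nlp.lang") == "en"
assert dot_to_object(config, "nlp.pipeline") == ["textcat"]
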
@@ -253,6 +253,14 @@ def load_vectors_into_model(
         nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"]=None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.lookups = lookups
+
+
 def load_model(
     name: Union[str, Path],
     *,

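A hedged usage sketch of the new helper (the table contents are invented for illustration; passing lookups=None leaves the vocab untouched):

import spacy
from spacy.lookups import Lookups
from spacy.util import load_vocab_data_into_model

# Attach a custom lookups table to a blank pipeline via the new helper.
nlp = spacy.blank("en")
lookups = Lookups()
lookups.add_table("lexeme_norm", {"wrld": "world"})
load_vocab_data_into_model(nlp, lookups=lookups)
assert nlp.vocab.lookups.has_table("lexeme_norm")
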
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length

@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang


-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)

@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,

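A small sketch of what the simplified helper now returns, assuming create_vocab is importable from spacy.vocab as in the source tree: a vocab whose lookups start out empty, since tables are attached later (e.g. during training) rather than at construction time:

from spacy.lang.en import English
from spacy.vocab import create_vocab

# create_vocab no longer takes load_data; the resulting vocab starts with an
# empty Lookups container.
vocab = create_vocab("en", English.Defaults)
print(vocab.lookups.tables)  # expected: []
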
@@ -424,6 +417,19 @@ cdef class Vocab:
         orth = self.strings.add(orth)
         return orth in self.vectors

+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.

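To illustrate the effect of the new setter, a hedged sketch (the norm entry is invented; actual output may also depend on any tables shipped by spacy-lookups-data if installed):

import spacy
from spacy.lookups import Lookups

# Assigning a Lookups object with a "lexeme_norm" table re-wires the NORM
# lexical attribute getter, so lexemes created afterwards pick up the norm.
nlp = spacy.blank("en")
lookups = Lookups()
lookups.add_table("lexeme_norm", {"wrld": "world"})
nlp.vocab.lookups = lookups
doc = nlp("wrld")
print(doc[0].norm_)  # expected: "world"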
|
Loading…
Reference in New Issue
Block a user