Merge pull request #6104 from svlandeg/fix/debug_model [ci skip]

commit 81606b29bd
Ines Montani, 2020-09-22 09:31:23 +02:00 (committed by GitHub)
2 changed files with 61 additions and 18 deletions

spacy/cli/debug_model.py

@@ -1,5 +1,9 @@
-from typing import Dict, Any, Optional
+import warnings
+from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
+from spacy.training import Example
+from spacy.util import dot_to_object
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
@@ -59,23 +63,24 @@ def debug_model_cli(
     allocator = config["training"]["gpu_allocator"]
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
-    nlp, config = util.load_model_from_config(config_path)
+    nlp, config = util.load_model_from_config(config)
     seed = config["training"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
     pipe = nlp.get_pipe(component)
-    if hasattr(pipe, "model"):
-        model = pipe.model
-    else:
+    if not hasattr(pipe, "model"):
         msg.fail(
             f"The component '{component}' does not specify an object that holds a Model.",
             exits=1,
         )
-    debug_model(model, print_settings=print_settings)
+    model = pipe.model
+    debug_model(config, nlp, model, print_settings=print_settings)
 
 
-def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(
+    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -92,10 +97,23 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    Y = _get_output(model.ops.xp)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=X, Y=Y)
+        try:
+            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
+            nlp.begin_training(lambda: train_corpus(nlp))
+            msg.info("Initialized the model with the training corpus.")
+        except ValueError:
+            try:
+                _set_output_dim(nO=7, model=model)
+                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                msg.info("Initialized the model with dummy data.")
+            except:
+                msg.fail(
+                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    exits=1,
+                )
 
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
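Note (not part of the diff): dot_to_object resolves the dotted path stored under [training] train_corpus against the config, so the corpus reader registered there can be called to produce Examples for nlp.begin_training. A rough sketch of that lookup, with made-up values:

    # Rough sketch of the dot_to_object lookup used above (values are hypothetical).
    from spacy.util import dot_to_object

    config = {
        "training": {"train_corpus": "corpora.train"},
        "corpora": {"train": "<corpus reader would live here>"},
    }
    path = config["training"]["train_corpus"]   # "corpora.train"
    print(dot_to_object(config, path))          # resolves to config["corpora"]["train"]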
@@ -103,9 +121,18 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
     # STEP 2: Updating the model and printing again
     optimizer = Adam(0.001)
     set_dropout_rate(model, 0.2)
+    # ugly hack to deal with Tok2Vec listeners
+    tok2vec = None
+    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
+        tok2vec = nlp.get_pipe("tok2vec")
+    goldY = None
     for e in range(3):
-        Y, get_dX = model.begin_update(_get_docs())
-        dY = get_gradient(model, Y)
+        if tok2vec:
+            tok2vec.predict(X)
+        Y, get_dX = model.begin_update(X)
+        if goldY is None:
+            goldY = _simulate_gold(Y)
+        dY = get_gradient(goldY, Y, model.ops)
         get_dX(dY)
         model.finish_update(optimizer)
     if print_settings.get("print_after_training"):
@@ -113,15 +140,25 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
         _print_model(model, print_settings)
 
     # STEP 3: the final prediction
-    prediction = model.predict(_get_docs())
+    prediction = model.predict(X)
     if print_settings.get("print_prediction"):
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
+    msg.good(f"Succesfully ended analysis - model looks good.")
 
 
-def get_gradient(model, Y):
-    goldY = _get_output(model.ops.xp)
-    return Y - goldY
+def get_gradient(goldY, Y, ops):
+    return ops.asarray(Y) - ops.asarray(goldY)
+
+
+def _simulate_gold(element, counter=1):
+    if isinstance(element, Iterable):
+        for i in range(len(element)):
+            element[i] = _simulate_gold(element[i], counter + i)
+        return element
+    else:
+        return 1 / counter
 
 
 def _sentences():
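Note (not part of the diff): _simulate_gold recursively fills whatever structure the model outputs with dummy target values (1, 1/2, 1/3, ...), and get_gradient then returns the plain difference Y - goldY, i.e. the gradient of a simple squared-error objective against those dummy targets. A standalone illustration of the recursion, using nested lists in place of the model's arrays:

    # Standalone illustration of the _simulate_gold recursion above (lists stand in
    # for the model's output arrays; the real code checks typing.Iterable).
    def simulate_gold(element, counter=1):
        if isinstance(element, list):
            for i in range(len(element)):
                element[i] = simulate_gold(element[i], counter + i)
            return element
        return 1 / counter

    print(simulate_gold([[0.0, 0.0], [0.0, 0.0]]))
    # [[1.0, 0.5], [0.5, 0.333...]] -- arbitrary but deterministic "gold" targets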
@@ -138,8 +175,13 @@ def _get_docs(lang: str = "en"):
     return list(nlp.pipe(_sentences()))
 
 
-def _get_output(xp):
-    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+def _set_output_dim(model, nO):
+    # simulating dim inference by directly setting the nO argument of the model
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)
 
 
 def _print_model(model, print_settings):
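Note (not part of the diff): in Thinc, Model.has_dim returns None when a model declares a dimension that has not been set yet, which is why _set_output_dim checks "is None" before forcing nO. A minimal sketch of the same check on a bare Thinc layer:

    # Minimal sketch of the dim-forcing workaround on a plain Thinc layer.
    from thinc.api import Linear

    layer = Linear()                  # nO/nI declared but not yet set
    print(layer.has_dim("nO"))        # None -> dimension exists but is unset
    if layer.has_dim("nO") is None:
        layer.set_dim("nO", 7)        # force the output dimension, as above
    print(layer.get_dim("nO"))        # 7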

spacy/errors.py

@@ -545,7 +545,8 @@ class Errors:
     E949 = ("Can only create an alignment when the texts are the same.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
-    E954 = ("The Tok2Vec listener did not receive a valid input.")
+    E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
+            "component.")
     E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")