Various updates/additions to CLI scripts (#5362)

* `debug-data`: determine coverage of provided vectors

* `evaluate`: support `blank:lg` models (e.g. `blank:en`) so it's possible to
  evaluate tokenization on its own

* `init-model`: add an option to truncate the vectors read from a word2vec
  file to the N most frequent entries

* `train`:

  * if training on GPU, only run evaluation/timing on CPU in the first
    iteration

  * if training is aborted, exit with a non-zero exit status (see the sketch
    of the new `blank:` usage below)
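As a rough sketch of the most user-visible change, the new `blank:` model string can be exercised from Python via `spacy.cli`, which exposes the same functions the command line wraps (the gold-corpus path here is a placeholder):

    from spacy.cli import evaluate

    # Score only tokenization by evaluating a blank (untrained) English
    # pipeline against gold data; "dev.json" is a placeholder path to a
    # JSON-format corpus.
    evaluate("blank:en", "dev.json")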
adrianeboyd authored 2020-04-29 12:56:46 +02:00, committed by GitHub
parent cfdaf99b80
commit bdff76dede
5 changed files with 68 additions and 27 deletions

spacy/cli/debug_data.py

@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                100 * n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )


-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
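A standalone sketch of the coverage check added above, assuming a model with vectors installed (e.g. `en_core_web_md`); the word list stands in for the gold training words, and the vectors table is keyed by string hashes, hence the `strings` lookup:

    from collections import Counter

    import spacy

    nlp = spacy.load("en_core_web_md")
    words = ["apple", "banana", "floopity"]  # placeholder "training" words
    missing = Counter()
    if len(nlp.vocab.vectors):  # only meaningful if the model has vectors
        for word in words:
            if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                missing.update([word])
    print(missing.most_common(10))  # e.g. [("floopity", 1)]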
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
         if label not in labels:
             count += 1
     return count

spacy/cli/evaluate.py

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function

 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
@@ -43,7 +44,10 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_docs, verbose=False)
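The `blank:` branch is small enough to sketch in isolation: everything after the prefix is treated as a language code and passed to `spacy.blank`, which builds an untrained pipeline, so only the tokenizer contributes to the evaluation (the model string below is a placeholder):

    import spacy
    from spacy import util

    model = "blank:en"  # placeholder model string
    if model.startswith("blank:"):
        nlp = spacy.blank(model.replace("blank:", ""))
    else:
        nlp = util.load_model(model)
    print(nlp.lang, nlp.pipe_names)  # "en" and an empty component list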

spacy/cli/init_model.py

@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
@@ -88,7 +95,7 @@ def init_model(
     nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp


-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)


-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
     return vectors_data, vectors_keys
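The truncation logic in `read_vectors` can be tried without a real vectors file; a self-contained sketch with toy rows in place of the word2vec file. The header row gives the vector count and width, so truncation just shrinks the allocated array and stops the read loop early:

    import numpy

    lines = iter(["4 3", "a 1 2 3", "b 4 5 6", "c 7 8 9", "d 1 1 1"])  # toy file
    truncate_vectors = 2
    shape = tuple(int(size) for size in next(lines).split())  # header: (4, 3)
    if truncate_vectors >= 1:
        shape = (truncate_vectors, shape[1])  # allocate only what we keep
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(lines):
        pieces = line.rstrip().rsplit(" ", shape[1])
        word = pieces.pop(0)
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
        if i == truncate_vectors - 1:  # stop after the N most frequent rows
            break
    print(vectors_keys, vectors_data.shape)  # ['a', 'b'] (2, 3)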

spacy/cli/train.py

@@ -454,22 +454,25 @@ def train(
                         cpu_wps = nwords / (end_time - start_time)
                     else:
                         gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
-                            nlp_loaded = util.load_model_from_path(epoch_model_path)
-                            for name, component in nlp_loaded.pipeline:
-                                if hasattr(component, "cfg"):
-                                    component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
-                                    nlp_loaded,
-                                    gold_preproc=gold_preproc,
-                                    ignore_misaligned=True,
-                                )
-                            )
-                            start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
-                            end_time = timer()
-                            cpu_wps = nwords / (end_time - start_time)
+                        # Only evaluate on CPU in the first iteration (for
+                        # timing) if GPU is enabled
+                        if i == 0:
+                            with Model.use_device("cpu"):
+                                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                for name, component in nlp_loaded.pipeline:
+                                    if hasattr(component, "cfg"):
+                                        component.cfg["beam_width"] = beam_width
+                                dev_docs = list(
+                                    corpus.dev_docs(
+                                        nlp_loaded,
+                                        gold_preproc=gold_preproc,
+                                        ignore_misaligned=True,
+                                    )
+                                )
+                                start_time = timer()
+                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                                end_time = timer()
+                                cpu_wps = nwords / (end_time - start_time)
             acc_loc = output_path / ("model%d" % i) / "accuracy.json"
             srsly.write_json(acc_loc, scorer.scores)
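The control flow above boils down to "pay for the CPU timing pass once, then reuse the number". A toy illustration, with a hypothetical `evaluate_on_cpu` standing in for the loaded-model evaluation block:

    from timeit import default_timer as timer

    def evaluate_on_cpu():
        return sum(range(1000000))  # placeholder for the real evaluation

    nwords = 1000  # placeholder token count
    cpu_wps = None
    for i in range(3):  # stands in for the training loop
        if i == 0:  # only time the CPU pass on the first iteration
            start_time = timer()
            evaluate_on_cpu()
            end_time = timer()
            cpu_wps = nwords / (end_time - start_time)
        print(i, cpu_wps)  # later iterations reuse the first measurement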
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
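The `exits=1` argument is wasabi's built-in way to terminate after printing: the printer calls `sys.exit(1)`, which raises `SystemExit`, so the `finally` block above still runs before the process exits with a non-zero status. A minimal sketch:

    from wasabi import msg

    # Prints the warning, then exits the process with status 1.
    msg.warn("Aborting and saving the final best model.", exits=1)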

website/docs/api/cli.md

@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir`               | positional | Model output directory. Will be created if it doesn't exist. |
 | `--jsonl-loc`, `-j`        | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v`      | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V`    | flag       | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--truncate-vectors`, `-t` | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V`    | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn`    | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
 | **CREATES**                | model      | A spaCy model containing the vocab and vectors. |
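A note on how the two options differ in practice: truncation drops rows while the file is being read (word2vec files are conventionally sorted by descending frequency, so the first N rows are the N most frequent words), whereas pruning loads everything first and then remaps pruned entries to their closest surviving vector, which costs more memory up front. A sketch of both calls via the Python entry point (`vectors.txt` and the output paths are placeholders):

    from spacy.cli import init_model

    # Cheap: keep only the first (most frequent) 20000 rows at read time.
    init_model("en", "./out_truncated", vectors_loc="vectors.txt",
               truncate_vectors=20000)

    # More thorough: load all vectors, then prune the table to 20000 rows,
    # remapping pruned words to their nearest remaining vector.
    init_model("en", "./out_pruned", vectors_loc="vectors.txt",
               prune_vectors=20000)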