mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Various updates/additions to CLI scripts (#5362)
* `debug-data`: determine coverage of provided vectors
* `evaluate`: support `blank:lg` model to make it possible to just evaluate tokenization
* `init-model`: add option to truncate vectors to N most frequent vectors from word2vec file
* `train`:
  * if training on GPU, only run evaluation/timing on CPU in the first iteration
  * if training is aborted, exit with a non-0 exit status
This commit is contained in:
parent
cfdaf99b80
commit
bdff76dede
spacy/cli/debug_data.py

```diff
@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )
 
 
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
         if label not in labels:
             count += 1
     return count
```
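The new `words_missing_vectors` counter works because spaCy's vector table is keyed by string hashes: `nlp.vocab.strings[word]` hashes the word, and the `in nlp.vocab.vectors` check looks that hash up in the table. A minimal standalone sketch of the same coverage check — `en_core_web_md` is just an example model, and the word list stands in for the gold training tokens:

```python
from collections import Counter

import spacy

nlp = spacy.load("en_core_web_md")  # any model package with vectors works
words = ["apple", "banana", "qwrtzxy"]  # stand-in for gold training tokens

missing = Counter()
if len(nlp.vocab.vectors):
    for word in words:
        # Vectors are keyed by string hash, so hash the word first.
        if nlp.vocab.strings[word] not in nlp.vocab.vectors:
            missing.update([word])

n_missing = sum(missing.values())
print("{} words without vectors ({:0.2f}%)".format(n_missing, 100 * n_missing / len(words)))
print("most common:", missing.most_common(10))
```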
spacy/cli/evaluate.py

```diff
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function
 
 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
 
@@ -43,6 +44,9 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
```
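The `blank:` prefix treats everything after the colon as a language code, so `blank:en` builds an empty English pipeline (tokenizer only) — which is what makes tokenization-only evaluation possible. A small sketch of the same dispatch, with `load_eval_model` a hypothetical helper name:

```python
import spacy
from spacy import util


def load_eval_model(model):
    # "blank:xx" -> empty pipeline for language code "xx" (tokenizer only).
    if model.startswith("blank:"):
        return spacy.blank(model.replace("blank:", ""))
    # Anything else is treated as a regular model name, package or path.
    return util.load_model(model)


nlp = load_eval_model("blank:en")
print(nlp.lang, nlp.pipe_names)  # -> en []
```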
spacy/cli/init_model.py

```diff
@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
@@ -88,7 +95,7 @@ def init_model(
     nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp
 
 
-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)
 
 
-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
    return vectors_data, vectors_keys
```
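Truncation leans on the word2vec convention that rows are sorted by descending frequency: shrinking the allocated array to the first N rows and breaking out of the read loop keeps exactly the N most frequent vectors. A simplified sketch of the reading loop under that assumption, without the compression formats and error handling of the real `read_vectors`:

```python
import numpy


def read_word2vec(path, truncate_vectors=0):
    # Sketch only: assumes a plain-text word2vec file whose header line
    # is "<rows> <dims>" and whose rows are frequency-sorted.
    with open(path, encoding="utf8") as f:
        shape = tuple(int(size) for size in next(f).split())
        if truncate_vectors >= 1:
            shape = (truncate_vectors, shape[1])
        data = numpy.zeros(shape=shape, dtype="f")
        keys = []
        for i, line in enumerate(f):
            word, *pieces = line.rstrip().split(" ")
            data[i] = numpy.asarray(pieces, dtype="f")
            keys.append(word)
            if i == truncate_vectors - 1:  # never true when truncation is off (0)
                break
    return data, keys
```

From the command line this surfaces as e.g. `python -m spacy init-model en ./output -v vectors.txt.gz -t 10000` (paths here are placeholders).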
spacy/cli/train.py

```diff
@@ -454,6 +454,9 @@ def train(
                     cpu_wps = nwords / (end_time - start_time)
                 else:
                     gpu_wps = nwords / (end_time - start_time)
-                    with Model.use_device("cpu"):
-                        nlp_loaded = util.load_model_from_path(epoch_model_path)
-                        for name, component in nlp_loaded.pipeline:
+                    # Only evaluate on CPU in the first iteration (for
+                    # timing) if GPU is enabled
+                    if i >= 1:
+                        with Model.use_device("cpu"):
+                            nlp_loaded = util.load_model_from_path(epoch_model_path)
+                            for name, component in nlp_loaded.pipeline:
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
```
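The abort path relies on wasabi's `exits` keyword: passing `exits=1` makes `msg.warn` print the message and then terminate the process with exit code 1, so wrappers can tell an aborted run from a clean one. A sketch of checking that status from Python, with placeholder corpus paths:

```python
import subprocess

# Placeholder paths; any `spacy train` invocation works the same way.
result = subprocess.run(
    ["python", "-m", "spacy", "train", "en", "./output", "train.json", "dev.json"]
)
if result.returncode != 0:
    # Training was aborted: msg.warn(..., exits=1) exits non-zero,
    # so skip packaging/deploying the model.
    print("training aborted with status", result.returncode)
```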
website/docs/api/cli.md

```diff
@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
 | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
 | **CREATES** | model | A spaCy model containing the vocab and vectors. |
```
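Read together, the two options compose: `--truncate-vectors` limits how many rows are read from the vectors file, and `--prune-vectors` then remaps the remaining vocabulary onto a smaller table. A hypothetical invocation through the Python API (paths and sizes are placeholders):

```python
from spacy.cli import init_model

init_model(
    "en",
    "./vectors_model",             # output directory, created if missing
    vectors_loc="vectors.txt.gz",  # word2vec-format vectors file
    truncate_vectors=50000,        # read only the 50k most frequent rows
    prune_vectors=20000,           # then prune the table to 20k vectors
)
```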