Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)
Various updates/additions to CLI scripts (#5362)
* `debug-data`: determine coverage of provided vectors
* `evaluate`: support `blank:lg` model to make it possible to just evaluate tokenization
* `init-model`: add option to truncate vectors to N most frequent vectors from word2vec file
* `train`:
  * if training on GPU, only run evaluation/timing on CPU in the first iteration
  * if training is aborted, exit with a non-0 exit status
parent: cfdaf99b80
commit: bdff76dede
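As a quick orientation, here is a minimal sketch of how the two user-facing additions can be exercised from Python; the paths (`dev.json`, `vectors.txt.gz`, `./vec_model`) are hypothetical, and spaCy v2.2.x with this commit applied is assumed:

```python
# Hypothetical usage sketch - not part of the commit itself.
from spacy.cli import evaluate, init_model

# `evaluate` now accepts "blank:<lang>", which builds a tokenizer-only
# pipeline, so only tokenization is scored against the gold dev data.
evaluate("blank:en", "dev.json")

# `init-model` can truncate a Word2Vec file to its first (most frequent)
# N rows before adding the vectors to the model's vocab.
init_model("en", "./vec_model", vectors_loc="vectors.txt.gz", truncate_vectors=10000)
```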
spacy/cli/debug_data.py:

```diff
@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
```
```diff
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
 
```
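The ratio reported above is the share of training tokens whose word has no entry in the vectors table. A standalone sketch of the same bookkeeping with toy data (`vector_keys` and `train_words` are invented names; unlike the code above, the sketch multiplies the ratio by 100 before formatting it with a percent sign):

```python
from collections import Counter

vector_keys = {"the", "cat", "sat"}  # stand-in for the keys in nlp.vocab.vectors
train_words = ["the", "cat", "sat", "on", "the", "mat"]

words_missing_vectors = Counter(w for w in train_words if w not in vector_keys)
n_missing_vectors = sum(words_missing_vectors.values())
print(
    "{} words in training data without vectors ({:0.2f}%)".format(
        n_missing_vectors, 100 * n_missing_vectors / len(train_words)
    )
)
print(words_missing_vectors.most_common(10))  # [('on', 1), ('mat', 1)]
```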
```diff
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )
 
 
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
```
```diff
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
```
```diff
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
```
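The membership test goes through `nlp.vocab.strings` because the `Vectors` table is keyed by 64-bit hashes rather than raw strings. A minimal sketch, assuming a vector-bearing model such as `en_core_web_md` is installed (an assumption, not part of this diff):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumed to be installed
for word in ["apple", "zxqwv"]:
    key = nlp.vocab.strings[word]           # string -> hash key
    print(word, key in nlp.vocab.vectors)   # membership in Vectors is by key
    # e.g. "apple True", "zxqwv False" for a model with typical coverage
```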
```diff
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
         if label not in labels:
             count += 1
     return count
```
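The expanded comprehension guards against `None` entries, which occur in `gold.ner` for tokens that could not be aligned; the old one-liner raised an `AttributeError` on `None.split`. A toy illustration:

```python
# `ner` mimics gold.ner with a None entry for an unaligned token.
ner = ["O", "B-PERSON", None, "I-PERSON", "-"]

labels = [
    label.split("-")[1]
    for label in ner
    if label is not None and label not in ("O", "-")
]
print(labels)  # ['PERSON', 'PERSON']
```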
spacy/cli/evaluate.py:

```diff
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function
 
 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
 
```
```diff
@@ -43,7 +44,10 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_docs, verbose=False)
```
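`spacy.blank(lang)` returns a pipeline containing only the language's tokenizer, so evaluating a `blank:` model effectively scores tokenization alone. A minimal sketch of the naming convention (the model string is a hypothetical CLI argument):

```python
import spacy
from spacy import util

model = "blank:en"  # hypothetical CLI argument
if model.startswith("blank:"):
    nlp = spacy.blank(model.replace("blank:", ""))  # tokenizer-only pipeline
else:
    nlp = util.load_model(model)
print(nlp.pipe_names)  # [] - no trained components, only the tokenizer runs
```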
spacy/cli/init_model.py:

```diff
@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
```
```diff
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
```
```diff
@@ -88,7 +95,7 @@ def init_model(
     nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
```
```diff
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp
 
 
-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
```
```diff
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
```
```diff
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)
 
 
-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
```
```diff
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
     return vectors_data, vectors_keys
 
 
```
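In the Word2Vec text format, the first line holds the row count and dimensionality, and rows are conventionally ordered by descending frequency, so stopping after `truncate_vectors` rows keeps the most frequent entries while the preallocated array is shrunk to match. A self-contained sketch of the same logic (the helper name and file handling are simplified; the real `read_vectors` also uses `open_file`, `tqdm`, and spaCy's `Errors` for malformed lines):

```python
import numpy

def read_vectors_sketch(path, truncate_vectors=0):
    # First line: "<n_rows> <n_dims>"; shrink the preallocated array if truncating.
    with open(path, "r", encoding="utf8") as f:
        shape = tuple(int(size) for size in next(f).split())
        if truncate_vectors >= 1:
            shape = (truncate_vectors, shape[1])
        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []
        for i, line in enumerate(f):
            pieces = line.rstrip().rsplit(" ", shape[1])  # [word, d1, ..., dn]
            word, vector = pieces[0], pieces[1:]
            vectors_data[i] = numpy.asarray(vector, dtype="f")
            vectors_keys.append(word)
            if i == truncate_vectors - 1:  # never true when truncate_vectors == 0
                break
    return vectors_data, vectors_keys
```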
spacy/cli/train.py:

```diff
@@ -454,22 +454,25 @@ def train(
                         cpu_wps = nwords / (end_time - start_time)
                     else:
                         gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
-                            nlp_loaded = util.load_model_from_path(epoch_model_path)
-                            for name, component in nlp_loaded.pipeline:
-                                if hasattr(component, "cfg"):
-                                    component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
-                                    nlp_loaded,
-                                    gold_preproc=gold_preproc,
-                                    ignore_misaligned=True,
-                                )
-                            )
-                            start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
-                            end_time = timer()
-                            cpu_wps = nwords / (end_time - start_time)
+                        # Only evaluate on CPU in the first iteration (for
+                        # timing) if GPU is enabled
+                        if i >= 1:
+                            with Model.use_device("cpu"):
+                                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                for name, component in nlp_loaded.pipeline:
+                                    if hasattr(component, "cfg"):
+                                        component.cfg["beam_width"] = beam_width
+                                dev_docs = list(
+                                    corpus.dev_docs(
+                                        nlp_loaded,
+                                        gold_preproc=gold_preproc,
+                                        ignore_misaligned=True,
+                                    )
+                                )
+                                start_time = timer()
+                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                                end_time = timer()
+                                cpu_wps = nwords / (end_time - start_time)
                     acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                     srsly.write_json(acc_loc, scorer.scores)
 
```
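`Model.use_device` is a thinc 7.x context manager that temporarily switches the backend ops, so the model is loaded and evaluated on CPU even when training ran on the GPU; gating it on the iteration counter avoids paying for a slow CPU evaluation on every epoch. A rough timing sketch under the same assumptions (the model path is hypothetical):

```python
from timeit import default_timer as timer

from thinc.neural._classes.model import Model  # thinc 7.x, as used by spaCy v2
from spacy import util

with Model.use_device("cpu"):  # everything inside this block runs on CPU ops
    nlp_loaded = util.load_model_from_path("./output/model0")  # hypothetical path
    texts = ["This is a sentence.", "Here is another one."]
    start_time = timer()
    nwords = sum(len(doc) for doc in nlp_loaded.pipe(texts))
    end_time = timer()
print("cpu wps: {:.0f}".format(nwords / (end_time - start_time)))
```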
```diff
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
```
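Passing `exits=1` makes wasabi print the message and then terminate the process via `sys.exit(1)`, so an aborted training run now surfaces as a non-zero exit status to shell scripts and CI. A toy illustration of the behaviour:

```python
from wasabi import msg

try:
    raise ValueError("boom")  # stand-in for a failure during training
except Exception as e:
    msg.warn(
        "Aborting and saving the final best model. "
        "Encountered exception: {}".format(e),
        exits=1,  # print the warning, then sys.exit(1) -> non-zero exit status
    )
```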
website/docs/api/cli.md:

```diff
@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
 | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
 | **CREATES** | model | A spaCy model containing the vocab and vectors. |
```