Various updates/additions to CLI scripts (#5362)

* `debug-data`: determine coverage of provided vectors

* `evaluate`: support `blank:lg` model to make it possible to just evaluate
tokenization

* `init-model`: add option to truncate the vector table to the N most
frequent vectors when reading a word2vec file

* `train`:

  * if training on GPU, only run evaluation/timing on CPU in the first
    iteration

  * if training is aborted, exit with a non-0 exit status
adrianeboyd committed 2020-04-29 12:56:46 +02:00 (via GitHub)
parent cfdaf99b80
commit bdff76dede
5 changed files with 68 additions and 27 deletions

spacy/cli/debug_data.py

@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
 
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                100 * n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )
 
 
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
        if label not in labels:
             count += 1
     return count
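The coverage check uses only public vocab APIs: each word is hashed through the `StringStore` and looked up in the vector table by key. A minimal standalone sketch of the same idea (the model name and example words are placeholders):

```python
import spacy
from collections import Counter

# Assumes a model with vectors is installed, e.g. en_core_web_md;
# the words below stand in for tokens from the training data.
nlp = spacy.load("en_core_web_md")
words = ["apple", "banana", "qwertyuiop"]

missing = Counter()
if len(nlp.vocab.vectors):
    for word in words:
        # Vectors are keyed by the string's hash, so look the word
        # up in the StringStore first.
        if nlp.vocab.strings[word] not in nlp.vocab.vectors:
            missing.update([word])

print("{} words without vectors".format(sum(missing.values())))
print(missing.most_common(10))
```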

spacy/cli/evaluate.py

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function
 
 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
@@ -43,6 +44,9 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
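With this change, the `model` argument accepts a `blank:` prefix followed by a language code, which builds an empty pipeline via `spacy.blank()` instead of loading a trained model. Since a blank model only tokenizes, this makes it possible to score tokenization on its own, e.g. (the path is illustrative):

$ python -m spacy evaluate blank:en /path/to/dev.json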

spacy/cli/init_model.py

@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
@@ -88,7 +95,7 @@ def init_model(
     nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp
 
 
-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)
 
 
-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
     return vectors_data, vectors_keys
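Word2Vec text files list vectors in descending frequency order, so stopping after N rows keeps the N most frequent entries. A self-contained sketch of the same truncation logic (`read_vectors_truncated` is a hypothetical name; the file path and count in the usage comment are placeholders):

```python
import numpy

def read_vectors_truncated(path, truncate=0):
    """Read a word2vec-format text file, keeping only the first
    `truncate` rows (the most frequent words) when truncate >= 1."""
    with open(path, encoding="utf8") as f:
        # First line is the header: "<n_vectors> <n_dims>".
        shape = tuple(int(size) for size in next(f).split())
        if truncate >= 1:
            shape = (truncate, shape[1])
        data = numpy.zeros(shape=shape, dtype="f")
        keys = []
        for i, line in enumerate(f):
            # Split off the trailing n_dims values; the remainder is the word.
            pieces = line.rstrip().rsplit(" ", shape[1])
            word, values = pieces[0], pieces[1:]
            data[i] = numpy.asarray(values, dtype="f")
            keys.append(word)
            if i == truncate - 1:
                break
    return data, keys

# Hypothetical usage: keep the 100000 most frequent vectors.
# data, keys = read_vectors_truncated("vectors.txt", truncate=100000)
```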

spacy/cli/train.py

@@ -454,6 +454,9 @@ def train(
                             cpu_wps = nwords / (end_time - start_time)
                         else:
                             gpu_wps = nwords / (end_time - start_time)
-                            with Model.use_device("cpu"):
-                                nlp_loaded = util.load_model_from_path(epoch_model_path)
-                                for name, component in nlp_loaded.pipeline:
+                            # Only evaluate on CPU in the first iteration (for
+                            # timing) if GPU is enabled
+                            if i == 0:
+                                with Model.use_device("cpu"):
+                                    nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                    for name, component in nlp_loaded.pipeline:
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
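`wasabi`'s `exits` keyword makes `msg.warn()` print the message and then terminate the process with that status code, so an aborted run is now visible to the calling shell or CI job. A sketch of how a wrapper script might rely on this (the command-line arguments are placeholders):

```python
import subprocess

# Placeholder paths; any `spacy train` invocation behaves the same way.
result = subprocess.run(
    ["python", "-m", "spacy", "train", "en", "output_dir", "train.json", "dev.json"]
)
if result.returncode != 0:
    # Before this change, an aborted run still exited with status 0.
    print("Training aborted early; the final best model was still saved.")
```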

website/docs/api/cli.md

@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
 | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
 | **CREATES** | model | A spaCy model containing the vocab and vectors. |
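An illustrative invocation combining the new option with a vectors file (the path and count are placeholders):

$ python -m spacy init-model en /path/to/output --vectors-loc vectors.txt --truncate-vectors 100000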