Various updates/additions to CLI scripts (#5362)

* `debug-data`: determine coverage of provided vectors

* `evaluate`: support `blank:lg` models (e.g. `blank:en`) so it's possible to
  evaluate tokenization on its own

* `init-model`: add an option to truncate the vectors read from a word2vec
  file to the N most frequent entries

* `train`:

  * if training on GPU, only run evaluation/timing on CPU in the first
    iteration

  * if training is aborted, exit with a non-zero exit status (see the sketch
    of the new `blank:` usage below)
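As a rough sketch of the most user-visible change, the new `blank:` model string can be exercised from Python via `spacy.cli`, which exposes the same functions the command line wraps (the gold-corpus path here is a placeholder):

    from spacy.cli import evaluate

    # Score only tokenization by evaluating a blank (untrained) English
    # pipeline against gold data; "dev.json" is a placeholder path to a
    # JSON-format corpus.
    evaluate("blank:en", "dev.json")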
adrianeboyd authored 2020-04-29 12:56:46 +02:00, committed by GitHub
parent cfdaf99b80
commit bdff76dede
5 changed files with 68 additions and 27 deletions

spacy/cli/debug_data.py

@@ -108,9 +108,11 @@ def debug_data(
     msg.good("Corpus is loadable")
     # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_docs_unpreprocessed, pipeline, nlp
+    )
+    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -182,6 +184,16 @@ def debug_data(
                 nlp.vocab.vectors_length,
             )
         )
+        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+        msg.warn(
+            "{} words in training data without vectors ({:0.2f}%)".format(
+                n_missing_vectors,
+                100 * n_missing_vectors / gold_train_data["n_words"],
+            ),
+        )
+        msg.text(
+            "10 most common words without vectors: {}".format(_format_labels(gold_train_data["words_missing_vectors"].most_common(10), counts=True)), show=verbose,
+        )
     else:
         msg.info("No word vectors present in the model")
@@ -562,7 +574,7 @@ def _load_file(file_path, msg):
     )


-def _compile_gold(train_docs, pipeline):
+def _compile_gold(train_docs, pipeline, nlp):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -574,6 +586,7 @@ def _compile_gold(train_docs, pipeline):
         "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
+        "words_missing_vectors": Counter(),
         "n_sents": 0,
         "n_nonproj": 0,
         "n_cycles": 0,
@@ -586,6 +599,10 @@ def _compile_gold(train_docs, pipeline):
         data["n_words"] += len(valid_words)
         data["n_misaligned_words"] += len(gold.words) - len(valid_words)
         data["texts"].add(doc.text)
+        if len(nlp.vocab.vectors):
+            for word in valid_words:
+                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
+                    data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
             for i, label in enumerate(gold.ner):
                 if label is None:
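A standalone sketch of the coverage check added above, assuming a model with vectors installed (e.g. `en_core_web_md`); the word list stands in for the gold training words, and the vectors table is keyed by string hashes, hence the `strings` lookup:

    from collections import Counter

    import spacy

    nlp = spacy.load("en_core_web_md")
    words = ["apple", "banana", "floopity"]  # placeholder "training" words
    missing = Counter()
    if len(nlp.vocab.vectors):  # only meaningful if the model has vectors
        for word in words:
            if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                missing.update([word])
    print(missing.most_common(10))  # e.g. [("floopity", 1)]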
@@ -636,7 +653,11 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
     count = 0
     for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in gold.ner
+            if label is not None and label not in ("O", "-")
+        ]
         if label not in labels:
             count += 1
     return count

spacy/cli/evaluate.py

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals, division, print_function

 import plac
+import spacy
 from timeit import default_timer as timer
 from wasabi import msg
@@ -43,7 +44,10 @@ def evaluate(
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
-    nlp = util.load_model(model)
+    if model.startswith("blank:"):
+        nlp = spacy.blank(model.replace("blank:", ""))
+    else:
+        nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_docs, verbose=False)
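The `blank:` branch is small enough to sketch in isolation: everything after the prefix is treated as a language code and passed to `spacy.blank`, which builds an untrained pipeline, so only the tokenizer contributes to the evaluation (the model string below is a placeholder):

    import spacy
    from spacy import util

    model = "blank:en"  # placeholder model string
    if model.startswith("blank:"):
        nlp = spacy.blank(model.replace("blank:", ""))
    else:
        nlp = util.load_model(model)
    print(nlp.lang, nlp.pipe_names)  # "en" and an empty component list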

spacy/cli/init_model.py

@@ -35,6 +35,12 @@ DEFAULT_OOV_PROB = -20
     jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
     clusters_loc=("Optional location of brown clusters data", "option", "c", str),
     vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
+    truncate_vectors=(
+        "Optional number of vectors to truncate to when reading in vectors file",
+        "option",
+        "t",
+        int,
+    ),
     prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
     vectors_name=(
         "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
@@ -51,6 +57,7 @@ def init_model(
     clusters_loc=None,
     jsonl_loc=None,
     vectors_loc=None,
+    truncate_vectors=0,
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
@@ -88,7 +95,7 @@ def init_model(
     nlp = create_model(lang, lex_attrs, name=model_name)
     msg.good("Successfully created model")
     if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
+        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
@@ -169,7 +176,7 @@ def create_model(lang, lex_attrs, name=None):
     return nlp


-def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
+def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -179,7 +186,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
     else:
         if vectors_loc:
             with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc)
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
             msg.good("Loaded vectors from {}".format(vectors_loc))
         else:
             vectors_data, vector_keys = (None, None)
@@ -199,9 +206,11 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
         nlp.vocab.prune_vectors(prune_vectors)


-def read_vectors(vectors_loc):
+def read_vectors(vectors_loc, truncate_vectors=0):
     f = open_file(vectors_loc)
     shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
     for i, line in enumerate(tqdm(f)):
@@ -212,6 +221,8 @@ def read_vectors(vectors_loc):
             msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
         vectors_data[i] = numpy.asarray(pieces, dtype="f")
         vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
     return vectors_data, vectors_keys
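The truncation logic in `read_vectors` can be tried without a real vectors file; a self-contained sketch with toy rows in place of the word2vec file. The header row gives the vector count and width, so truncation just shrinks the allocated array and stops the read loop early:

    import numpy

    lines = iter(["4 3", "a 1 2 3", "b 4 5 6", "c 7 8 9", "d 1 1 1"])  # toy file
    truncate_vectors = 2
    shape = tuple(int(size) for size in next(lines).split())  # header: (4, 3)
    if truncate_vectors >= 1:
        shape = (truncate_vectors, shape[1])  # allocate only what we keep
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    for i, line in enumerate(lines):
        pieces = line.rstrip().rsplit(" ", shape[1])
        word = pieces.pop(0)
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
        if i == truncate_vectors - 1:  # stop after the N most frequent rows
            break
    print(vectors_keys, vectors_data.shape)  # ['a', 'b'] (2, 3)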

spacy/cli/train.py

@@ -454,22 +454,25 @@ def train(
                         cpu_wps = nwords / (end_time - start_time)
                     else:
                         gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
-                            nlp_loaded = util.load_model_from_path(epoch_model_path)
-                            for name, component in nlp_loaded.pipeline:
-                                if hasattr(component, "cfg"):
-                                    component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
-                                    nlp_loaded,
-                                    gold_preproc=gold_preproc,
-                                    ignore_misaligned=True,
-                                )
-                            )
-                            start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
-                            end_time = timer()
-                            cpu_wps = nwords / (end_time - start_time)
+                        # Only evaluate on CPU in the first iteration (for
+                        # timing) if GPU is enabled
+                        if i == 0:
+                            with Model.use_device("cpu"):
+                                nlp_loaded = util.load_model_from_path(epoch_model_path)
+                                for name, component in nlp_loaded.pipeline:
+                                    if hasattr(component, "cfg"):
+                                        component.cfg["beam_width"] = beam_width
+                                dev_docs = list(
+                                    corpus.dev_docs(
+                                        nlp_loaded,
+                                        gold_preproc=gold_preproc,
+                                        ignore_misaligned=True,
+                                    )
+                                )
+                                start_time = timer()
+                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                                end_time = timer()
+                                cpu_wps = nwords / (end_time - start_time)
             acc_loc = output_path / ("model%d" % i) / "accuracy.json"
             srsly.write_json(acc_loc, scorer.scores)
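The control flow above boils down to "pay for the CPU timing pass once, then reuse the number". A toy illustration, with a hypothetical `evaluate_on_cpu` standing in for the loaded-model evaluation block:

    from timeit import default_timer as timer

    def evaluate_on_cpu():
        return sum(range(1000000))  # placeholder for the real evaluation

    nwords = 1000  # placeholder token count
    cpu_wps = None
    for i in range(3):  # stands in for the training loop
        if i == 0:  # only time the CPU pass on the first iteration
            start_time = timer()
            evaluate_on_cpu()
            end_time = timer()
            cpu_wps = nwords / (end_time - start_time)
        print(i, cpu_wps)  # later iterations reuse the first measurement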
@@ -550,7 +553,8 @@ def train(
     except Exception as e:
         msg.warn(
             "Aborting and saving the final best model. "
-            "Encountered exception: {}".format(e)
+            "Encountered exception: {}".format(e),
+            exits=1,
         )
     finally:
         best_pipes = nlp.pipe_names
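The `exits=1` argument is wasabi's built-in way to terminate after printing: the printer calls `sys.exit(1)`, which raises `SystemExit`, so the `finally` block above still runs before the process exits with a non-zero status. A minimal sketch:

    from wasabi import msg

    # Prints the warning, then exits the process with status 1.
    msg.warn("Aborting and saving the final best model.", exits=1)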

website/docs/api/cli.md

@@ -547,7 +547,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | `output_dir`               | positional | Model output directory. Will be created if it doesn't exist. |
 | `--jsonl-loc`, `-j`        | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
 | `--vectors-loc`, `-v`      | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
-| `--prune-vectors`, `-V`    | flag       | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
+| `--truncate-vectors`, `-t` | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
+| `--prune-vectors`, `-V`    | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
 | `--vectors-name`, `-vn`    | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
 | **CREATES**                | model      | A spaCy model containing the vocab and vectors. |
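A note on how the two options differ in practice: truncation drops rows while the file is being read (word2vec files are conventionally sorted by descending frequency, so the first N rows are the N most frequent words), whereas pruning loads everything first and then remaps pruned entries to their closest surviving vector, which costs more memory up front. A sketch of both calls via the Python entry point (`vectors.txt` and the output paths are placeholders):

    from spacy.cli import init_model

    # Cheap: keep only the first (most frequent) 20000 rows at read time.
    init_model("en", "./out_truncated", vectors_loc="vectors.txt",
               truncate_vectors=20000)

    # More thorough: load all vectors, then prune the table to 20000 rows,
    # remapping pruned words to their nearest remaining vector.
    init_model("en", "./out_pruned", vectors_loc="vectors.txt",
               prune_vectors=20000)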