Mirror of https://github.com/explosion/spaCy.git
Fix spacy vocab command
commit 98c35d2585
parent d0cf12c8c7
@@ -19,7 +19,7 @@ if __name__ == '__main__':
         'convert': convert,
         'package': package,
         'model': model,
-        'model': vocab,
+        'vocab': vocab,
         'profile': profile,
         'validate': validate
     }
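The one-line change above is the actual fix for the `spacy vocab` command: in the old dict literal the vocab handler was registered under the key 'model', which already existed, so Python kept only the last value, 'vocab' was never exposed, and 'model' silently dispatched to the wrong function. A minimal self-contained sketch of that duplicate-key behaviour, with stand-in functions in place of the real spaCy CLI handlers:

# Minimal sketch of the duplicate-key problem fixed above; `model` and
# `vocab` stand in for the real spaCy CLI functions.
def model():
    return "model command"

def vocab():
    return "vocab command"

# Before the fix: the second 'model' key silently overwrites the first,
# so commands['model'] dispatches to vocab and 'vocab' is missing.
broken = {
    'model': model,
    'model': vocab,
}
assert broken['model'] is vocab
assert 'vocab' not in broken

# After the fix: both commands are reachable under their own names.
fixed = {
    'model': model,
    'vocab': vocab,
}
assert fixed['model'] is model
assert fixed['vocab'] is vocab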
@@ -1,31 +1,33 @@
-'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
+# coding: utf8
+from __future__ import unicode_literals
 
-from pathlib import Path
 import plac
 import json
 import spacy
 import numpy
-from spacy.util import ensure_path
+from pathlib import Path
 
+from ..util import prints, ensure_path
 
 
 @plac.annotations(
     lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
+    output_dir=("model output directory", "positional", None, Path),
     lexemes_loc=("location of JSONL-formatted lexical data", "positional",
-                 None, str),
-    vectors_loc=("location of vectors data, as numpy .npz (optional)",
-                 "positional", None, str),
-    version=("Model version", "option", "V", str),
-)
-def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
-    out_dir = ensure_path(output_dir)
-    jsonl_loc = ensure_path(lexemes_loc)
+                 None, Path),
+    vectors_loc=("optional: location of vectors data, as numpy .npz",
+                 "positional", None, str))
+def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None):
+    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
+    if not lexemes_loc.exists():
+        prints(lexemes_loc, title="Can't find lexical data", exits=1)
     vectors_loc = ensure_path(vectors_loc)
     nlp = spacy.blank(lang)
     for word in nlp.vocab:
         word.rank = 0
-    with jsonl_loc.open() as file_:
+    lex_added = 0
+    vec_added = 0
+    with lexemes_loc.open() as file_:
         for line in file_:
             if line.strip():
                 attrs = json.loads(line)
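For reference, the rewritten loop above reads the lexical data as JSONL: every non-empty line is parsed with json.loads into an attribute dict, and the next hunk shows that at least 'orth' and 'id' are expected, with 'id' doubling as the lexeme's rank. The snippet below is a hypothetical way to produce such a file; the example tokens and the file name are assumptions, not part of the commit.

import json

# Toy lexical data in the JSONL layout consumed above: one JSON object per
# line. 'orth' and 'id' are the keys the command actually reads; every key
# is applied to the lexeme via lex.set_attrs(**attrs). Ids start at 1
# because lexemes left at rank 0 are skipped when vectors are attached.
lexemes = [
    {"orth": "apple", "id": 1},
    {"orth": "banana", "id": 2},
    {"orth": "cherry", "id": 3},
]

with open("lexemes.jsonl", "w") as file_:
    for attrs in lexemes:
        file_.write(json.dumps(attrs) + "\n")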
@@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None):
                 lex = nlp.vocab[attrs['orth']]
                 lex.set_attrs(**attrs)
                 assert lex.rank == attrs['id']
+                lex_added += 1
     if vectors_loc is not None:
         vector_data = numpy.load(open(vectors_loc, 'rb'))
         nlp.vocab.clear_vectors(width=vector_data.shape[1])
-        added = 0
         for word in nlp.vocab:
             if word.rank:
                 nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                       vector=vector_data[word.rank])
-                added += 1
-    nlp.to_disk(out_dir)
+                vec_added += 1
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
+           title="Sucessfully compiled vocab and vectors, and saved model")
     return nlp
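The vector branch above indexes the loaded array by each lexeme's rank, so the vectors file is expected to behave like a 2-D array whose row i holds the vector for the lexeme with id i, with vector_data.shape[1] giving the width. The sketch below builds a matching toy array for the hypothetical lexemes.jsonl shown earlier; saving it as a plain .npy file is an assumption, since the help text mentions .npz but the code calls numpy.load on the opened file and indexes the result directly.

import numpy

# Hypothetical vector table matching the toy lexemes above: row i is the
# vector for the lexeme whose 'id' (and therefore rank) is i. Row 0 is a
# placeholder, because the loop above only adds vectors for word.rank != 0.
width = 4                                    # assumed vector dimensionality
vector_data = numpy.zeros((4, width), dtype="float32")
vector_data[1] = [0.1, 0.2, 0.3, 0.4]        # "apple"
vector_data[2] = [0.5, 0.6, 0.7, 0.8]        # "banana"
vector_data[3] = [0.9, 1.0, 1.1, 1.2]        # "cherry"

# numpy.load(open(path, 'rb')) on a .npy file returns a plain ndarray, which
# is what the shape[1] lookup and row indexing in the diff require.
numpy.save("vectors.npy", vector_data)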