diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 955b420aa..8953ac6be 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -35,6 +35,7 @@ msg = Printer() clusters_loc=("Optional location of brown clusters data", "option", "c", str), vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), prune_vectors=("Optional number of vectors to prune to", "option", "V", int), + vectors_name=("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str), ) def init_model( lang, @@ -44,6 +45,7 @@ def init_model( jsonl_loc=None, vectors_loc=None, prune_vectors=-1, + vectors_name=None ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -78,7 +80,7 @@ def init_model( nlp = create_model(lang, lex_attrs) msg.good("Successfully created model") if vectors_loc is not None: - add_vectors(nlp, vectors_loc, prune_vectors) + add_vectors(nlp, vectors_loc, prune_vectors, vectors_name) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) msg.good( @@ -160,7 +162,7 @@ def create_model(lang, lex_attrs): return nlp -def add_vectors(nlp, vectors_loc, prune_vectors): +def add_vectors(nlp, vectors_loc, prune_vectors, name=None): vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) @@ -181,7 +183,10 @@ def add_vectors(nlp, vectors_loc, prune_vectors): lexeme.is_oov = False if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) - nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] + if name is None: + nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] + else: + nlp.vocab.vectors.name = name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune_vectors >= 1: nlp.vocab.prune_vectors(prune_vectors) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 5d42f6fb8..8c6caa443 100644 --- 
a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -538,6 +538,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | | `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | | **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"}