Allow vectors name to be set in init-model (#4321)

* Allow vectors name to be specified in init-model

* Document --vectors-name argument to init-model

* Update website/docs/api/cli.md

Co-Authored-By: Ines Montani <ines@ines.io>
This commit is contained in:
Matthew Honnibal 2019-09-25 13:11:00 +02:00 committed by GitHub
parent 09816f8323
commit 92ed4dc5e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 3 deletions

View File

@ -35,6 +35,7 @@ msg = Printer()
clusters_loc=("Optional location of brown clusters data", "option", "c", str), clusters_loc=("Optional location of brown clusters data", "option", "c", str),
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
prune_vectors=("Optional number of vectors to prune to", "option", "V", int), prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
vectors_name=("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "vn", str)
) )
def init_model( def init_model(
lang, lang,
@ -44,6 +45,7 @@ def init_model(
jsonl_loc=None, jsonl_loc=None,
vectors_loc=None, vectors_loc=None,
prune_vectors=-1, prune_vectors=-1,
vectors_name=None
): ):
""" """
Create a new model from raw data, like word frequencies, Brown clusters Create a new model from raw data, like word frequencies, Brown clusters
@ -78,7 +80,7 @@ def init_model(
nlp = create_model(lang, lex_attrs) nlp = create_model(lang, lex_attrs)
msg.good("Successfully created model") msg.good("Successfully created model")
if vectors_loc is not None: if vectors_loc is not None:
add_vectors(nlp, vectors_loc, prune_vectors) add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
vec_added = len(nlp.vocab.vectors) vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab) lex_added = len(nlp.vocab)
msg.good( msg.good(
@ -160,7 +162,7 @@ def create_model(lang, lex_attrs):
return nlp return nlp
def add_vectors(nlp, vectors_loc, prune_vectors): def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -181,7 +183,10 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
lexeme.is_oov = False lexeme.is_oov = False
if vectors_data is not None: if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] if name is None:
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune_vectors >= 1: if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors) nlp.vocab.prune_vectors(prune_vectors)

View File

@ -538,6 +538,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | | `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | | `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. | | **CREATES** | model | A spaCy model containing the vocab and vectors. |
## Evaluate {#evaluate new="2"} ## Evaluate {#evaluate new="2"}