mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Allow vectors name to be set in init-model (#4321)
* Allow vectors name to be specified in init-model
* Document --vectors-name argument to init-model
* Update website/docs/api/cli.md

Co-Authored-By: Ines Montani <ines@ines.io>
This commit is contained in:
parent
09816f8323
commit
92ed4dc5e0
|
@ -35,6 +35,7 @@ msg = Printer()
|
|||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||
vectors_name=("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "vn", str)
|
||||
)
|
||||
def init_model(
|
||||
lang,
|
||||
|
@ -44,6 +45,7 @@ def init_model(
|
|||
jsonl_loc=None,
|
||||
vectors_loc=None,
|
||||
prune_vectors=-1,
|
||||
vectors_name=None
|
||||
):
|
||||
"""
|
||||
Create a new model from raw data, like word frequencies, Brown clusters
|
||||
|
@ -78,7 +80,7 @@ def init_model(
|
|||
nlp = create_model(lang, lex_attrs)
|
||||
msg.good("Successfully created model")
|
||||
if vectors_loc is not None:
|
||||
add_vectors(nlp, vectors_loc, prune_vectors)
|
||||
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
msg.good(
|
||||
|
@ -160,7 +162,7 @@ def create_model(lang, lex_attrs):
|
|||
return nlp
|
||||
|
||||
|
||||
def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||
|
@ -181,7 +183,10 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
|||
lexeme.is_oov = False
|
||||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
if name is None:
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune_vectors >= 1:
|
||||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
|
|
@ -538,6 +538,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
|||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||
|
||||
## Evaluate {#evaluate new="2"}
|
||||
|
|
Loading…
Reference in New Issue
Block a user