mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Allow vectors name to be set in init-model (#4321)
* Allow vectors name to be specified in init-model * Document --vectors-name argument to init-model * Update website/docs/api/cli.md Co-Authored-By: Ines Montani <ines@ines.io>
This commit is contained in:
parent
09816f8323
commit
92ed4dc5e0
|
@ -35,6 +35,7 @@ msg = Printer()
|
||||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||||
|
vectors_name=("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "vn", str)
|
||||||
)
|
)
|
||||||
def init_model(
|
def init_model(
|
||||||
lang,
|
lang,
|
||||||
|
@ -44,6 +45,7 @@ def init_model(
|
||||||
jsonl_loc=None,
|
jsonl_loc=None,
|
||||||
vectors_loc=None,
|
vectors_loc=None,
|
||||||
prune_vectors=-1,
|
prune_vectors=-1,
|
||||||
|
vectors_name=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data, like word frequencies, Brown clusters
|
Create a new model from raw data, like word frequencies, Brown clusters
|
||||||
|
@ -78,7 +80,7 @@ def init_model(
|
||||||
nlp = create_model(lang, lex_attrs)
|
nlp = create_model(lang, lex_attrs)
|
||||||
msg.good("Successfully created model")
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(nlp, vectors_loc, prune_vectors)
|
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
|
||||||
vec_added = len(nlp.vocab.vectors)
|
vec_added = len(nlp.vocab.vectors)
|
||||||
lex_added = len(nlp.vocab)
|
lex_added = len(nlp.vocab)
|
||||||
msg.good(
|
msg.good(
|
||||||
|
@ -160,7 +162,7 @@ def create_model(lang, lex_attrs):
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp, vectors_loc, prune_vectors):
|
def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
|
@ -181,7 +183,10 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||||
lexeme.is_oov = False
|
lexeme.is_oov = False
|
||||||
if vectors_data is not None:
|
if vectors_data is not None:
|
||||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
if name is None:
|
||||||
|
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||||
|
else:
|
||||||
|
nlp.vocab.vectors.name = name
|
||||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||||
if prune_vectors >= 1:
|
if prune_vectors >= 1:
|
||||||
nlp.vocab.prune_vectors(prune_vectors)
|
nlp.vocab.prune_vectors(prune_vectors)
|
||||||
|
|
|
@ -538,6 +538,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
||||||
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
|
||||||
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
|
||||||
|
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
|
||||||
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
|
||||||
|
|
||||||
## Evaluate {#evaluate new="2"}
|
## Evaluate {#evaluate new="2"}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user