mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-30 10:43:18 +03:00
Restore the 'jsonl' arg for init vectors
The lexemes.jsonl file is still used in our English vectors, and it may be required by users as well. I think it's worth supporting the option.
This commit is contained in:
parent
c379a4274a
commit
59294e91aa
|
@@ -21,6 +21,7 @@ def init_vectors_cli(
|
||||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||||
|
@@ -30,6 +31,13 @@ def init_vectors_cli(
|
||||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
|
if jsonl_loc is not None:
|
||||||
|
lex_attrs = srsly.read_jsonl(jsonl_loc)
|
||||||
|
for attrs in lex_attrs:
|
||||||
|
if "settings" in attrs:
|
||||||
|
continue
|
||||||
|
lexeme = nlp.vocab[attrs["orth"]]
|
||||||
|
lexeme.set_attrs(**attrs)
|
||||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user