mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
68 lines
2.8 KiB
Python
68 lines
2.8 KiB
Python
from typing import Optional, List, Dict, Any, Union, IO
|
|
import math
|
|
from tqdm import tqdm
|
|
import numpy
|
|
from ast import literal_eval
|
|
from pathlib import Path
|
|
from preshed.counter import PreshCounter
|
|
import tarfile
|
|
import gzip
|
|
import zipfile
|
|
import srsly
|
|
import warnings
|
|
from wasabi import msg, Printer
|
|
import typer
|
|
|
|
# NOTE(review): presumably the fallback (log-)probability for
# out-of-vocabulary words; it is not referenced in the visible part of this
# file -- confirm where it is consumed before changing the value.
DEFAULT_OOV_PROB = -20
|
|
|
|
|
|
@init_cli.command("vocab")
@app.command(
    "init-model",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,  # hide this from main CLI help but still allow it to work with warning
)
def init_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    lang: str = Arg(..., help="Pipeline language"),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
    # fmt: on
):
    """
    Create a new blank pipeline directory with vocab and vectors from raw data.
    If vectors are provided in Word2Vec format, they can be either a .txt or
    zipped as a .zip or .tar.gz.

    DOCS: https://nightly.spacy.io/api/cli#init-vocab
    """
    # This function is registered under two names ("init vocab" and the
    # hidden "init-model"); only the legacy alias triggers the rename notice.
    invoked_via_legacy_alias = ctx.command.name == "init-model"
    if invoked_via_legacy_alias:
        rename_notice = (
            "The init-model command is now called 'init vocab'. You can run "
            "'python -m spacy init --help' for an overview of the other "
            "available initialization commands."
        )
        msg.warn(rename_notice)

    # Forward every CLI value unchanged; the CLI entry point is never silent.
    vocab_options = {
        "freqs_loc": freqs_loc,
        "clusters_loc": clusters_loc,
        "jsonl_loc": jsonl_loc,
        "vectors_loc": vectors_loc,
        "prune_vectors": prune_vectors,
        "truncate_vectors": truncate_vectors,
        "vectors_name": vectors_name,
        "model_name": model_name,
        "base_model": base_model,
        "silent": False,
    }
    init_vocab(lang, output_dir, **vocab_options)
|