2020-06-21 22:35:01 +03:00
from typing import Optional , List , Dict , Any , Union , IO
2017-11-27 01:21:47 +03:00
import math
2019-12-16 15:12:19 +03:00
from tqdm import tqdm
2017-11-27 01:21:47 +03:00
import numpy
from ast import literal_eval
from pathlib import Path
from preshed . counter import PreshCounter
2018-03-21 16:33:23 +03:00
import tarfile
import gzip
2018-03-28 00:01:18 +03:00
import zipfile
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉
See here: https://github.com/explosion/srsly
Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.
At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.
srsly currently includes forks of the following packages:
ujson
msgpack
msgpack-numpy
cloudpickle
* WIP: replace json/ujson with srsly
* Replace ujson in examples
Use regular json instead of srsly to make code easier to read and follow
* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
2018-12-03 03:28:22 +03:00
import srsly
2020-04-28 15:00:11 +03:00
import warnings
2020-08-02 16:18:30 +03:00
from wasabi import msg , Printer
import typer
2017-11-27 01:21:47 +03:00
2019-08-01 18:26:09 +03:00
# Module-level default for the "OOV" probability. Presumably a log-probability
# for out-of-vocabulary words, given the negative value -- the code that reads
# this constant is outside this excerpt, so confirm at the use site.
DEFAULT_OOV_PROB = -20
2018-11-30 22:16:14 +03:00
2020-09-04 13:58:50 +03:00
# The same callback is registered twice: as "vocab" on the `init_cli` command
# group, and as a hidden top-level "init-model" command on `app` so the legacy
# name keeps working (with a warning) without showing up in the main help.
@init_cli.command("vocab")
@app.command(
    "init-model",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,  # hide this from main CLI help but still allow it to work with warning
)
def init_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    lang: str = Arg(..., help="Pipeline language"),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)"),
    # fmt: on
) -> None:
    """
    Create a new blank pipeline directory with vocab and vectors from raw data.
    If vectors are provided in Word2Vec format, they can be either a .txt or
    zipped as a .zip or .tar.gz.

    DOCS: https://nightly.spacy.io/api/cli#init-vocab
    """
    # NOTE: the docstring above is kept short on purpose -- Typer shows it as
    # the command's help text, so per-parameter documentation lives in the
    # `help=` strings of the signature instead.

    # Only warn when the command was reached through its legacy name; the
    # "vocab" registration shares this callback and must stay silent here.
    if ctx.command.name == "init-model":
        msg.warn(
            "The init-model command is now called 'init vocab'. You can run "
            "'python -m spacy init --help' for an overview of the other "
            "available initialization commands."
        )
    # Forward every CLI value unchanged to the implementation. `silent=False`
    # is hard-coded for CLI use (presumably this enables console output in
    # `init_vocab`; its definition is outside this excerpt).
    init_vocab(
        lang,
        output_dir,
        freqs_loc=freqs_loc,
        clusters_loc=clusters_loc,
        jsonl_loc=jsonl_loc,
        vectors_loc=vectors_loc,
        prune_vectors=prune_vectors,
        truncate_vectors=truncate_vectors,
        vectors_name=vectors_name,
        model_name=model_name,
        base_model=base_model,
        silent=False,
    )