spaCy/spacy/training/initialize.py

from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly
import numpy
import tarfile
import gzip
import zipfile
import tqdm

from .pretrain import get_tok2vec_ref
from ..lookups import Lookups
from ..vectors import Vectors
from ..errors import Errors, Warnings
from ..schemas import ConfigSchemaTraining
from ..util import registry, load_model_from_config, resolve_dot_names, logger
from ..util import load_model, ensure_path, get_sourced_components
from ..util import OOV_RANK, DEFAULT_OOV_PROB

if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Build an nlp object from a training config and initialize it.

    config (Config): The training config. Its [training] block must contain
        the keys "seed" and "gpu_allocator".
    use_gpu (int): The GPU allocator from the config is only applied when
        this value is >= 0.
    RETURNS (Language): The nlp object after resuming / initializing its
        components.
    RAISES: ValueError if "seed" or "gpu_allocator" is missing from
        [training]; ConfigValidationError if training.train_corpus or
        training.dev_corpus is not a string.
    """
    raw_config = config
    config = raw_config.interpolate()
    # These two settings are validated by hand before anything else is done
    if "seed" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in config["training"]:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    # The corpora must be given as dot names (strings), which are resolved
    # against the config below
    if not isinstance(T["train_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.train_corpus", type=type(T["train_corpus"])
            )
        )
    if not isinstance(T["dev_corpus"], str):
        raise ConfigValidationError(
            desc=Errors.E897.format(
                field="training.dev_corpus", type=type(T["dev_corpus"])
            )
        )
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    # Frozen and resumed components are excluded from initialization
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
            for listener in proc.listening_components:
                # Don't warn about components not in the pipeline. The check
                # has to use the component names: nlp.pipeline holds
                # (name, component) tuples, so a name is never "in" it and
                # every listener would be skipped.
                if listener not in nlp.pipe_names:
                    continue

                if listener in frozen_components and name not in frozen_components:
                    logger.warning(Warnings.W087.format(name=name, listener=listener))
                # We always check this regardless, in case user freezes tok2vec
                if listener not in frozen_components and name in frozen_components:
                    logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp


def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> None:
    """Populate the vocab of an nlp object in place.

    nlp (Language): The nlp object whose vocab is modified.
    data (Optional[Path]): Path to a JSONL file of lexeme attributes. Each
        entry is looked up by its "orth" value; entries containing a
        "settings" key are skipped.
    lookups (Optional[Lookups]): If truthy, replaces nlp.vocab.lookups.
    vectors (Optional[str]): If not None, passed to load_vectors_into_model.

    The function does not return anything; callers keep using the nlp object
    they passed in.
    """
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        # Reset the rank of every existing lexeme before applying the
        # attributes from the data file
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        # The OOV probability is set just below the lowest probability seen
        # in the vocab, or to the default if the vocab is empty
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")


def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object that receives the vectors.
    name (Union[str, Path]): Model name or path handed to load_model.
    add_strings (bool): Whether to also copy over the strings for the vector
        keys that the source model knows about.
    RAISES: ConfigValidationError (with an extended title and description) if
        loading the source model fails config validation.
    """
    try:
        source_nlp = load_model(name)
    except ConfigValidationError as e:
        hint = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        # Re-raise without the original traceback context
        raise ConfigValidationError.from_error(
            e, title=f"Config validation error for vectors {name}", desc=hint
        ) from None
    nlp.vocab.vectors = source_nlp.vocab.vectors
    if not add_strings:
        return
    # Copy the strings for the vector keys as well, e.g. so that the strings
    # are available if someone does a similarity query.
    source_strings = source_nlp.vocab.strings
    for key in nlp.vocab.vectors.key2row:
        if key in source_strings:
            nlp.vocab.strings.add(source_strings[key])


def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    """Load pretrained tok2vec weights - cf. CLI command 'pretrain'.

    nlp (Language): The nlp object containing the tok2vec layer.
    pretrain_config (Dict[str, Any]): Config passed to get_tok2vec_ref to
        locate the layer.
    init_config (Dict[str, Any]): Config whose "init_tok2vec" entry holds the
        path of the weights file, or None.
    RETURNS (bool): True if weights were loaded, False if no path was set.
    RAISES: ConfigValidationError if a path is set but does not exist.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        # Nothing configured, so there is nothing to load
        return False
    if not weights_path.exists():
        err = f"can't find pretrained tok2vec: {weights_path}"
        errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
        raise ConfigValidationError(config=nlp.config, errors=errors)
    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    tok2vec_layer = get_tok2vec_ref(nlp, pretrain_config)
    tok2vec_layer.from_bytes(weights_data)
    logger.info(f"Loaded pretrained weights from {weights_path}")
    return True


def convert_vectors(
    nlp: "Language",
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
) -> None:
    """Read vectors from a file and attach them to the vocab of `nlp`.

    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Location of the vectors. A file name ending
        in ".npz" is loaded with numpy; anything else goes through
        read_vectors. If falsy, no vectors are read.
    truncate (int): Passed on to read_vectors.
    prune (int): If >= 1, passed to nlp.vocab.prune_vectors.
    name (Optional[str]): Name for the vectors; if None, a name is built from
        the "lang" and "name" entries of nlp.meta.
    """
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        loaded = numpy.load(vectors_loc.open("rb"))
        nlp.vocab.vectors = Vectors(data=loaded)
        # Map every lexeme with a usable rank onto the row of that rank
        for lexeme in nlp.vocab:
            rank = lexeme.rank
            if rank and rank != OOV_RANK:
                nlp.vocab.vectors.add(lexeme.orth, row=rank)
    else:
        vectors_data = None
        vector_keys = None
        if vectors_loc:
            logger.info(f"Reading vectors from {vectors_loc}")
            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
            logger.info(f"Loaded vectors from {vectors_loc}")
        for word in vector_keys or ():
            if word not in nlp.vocab:
                # Looking the word up is done for its side effect on the vocab
                nlp.vocab[word]
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
        # TODO: Is this correct? Does this matter?
        name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1:
        nlp.vocab.prune_vectors(prune)


def read_vectors(vectors_loc: Path, truncate_vectors: int):
    """Read a text vectors file into a float array plus its list of keys.

    vectors_loc (Path): Location of the vectors file, read via ensure_shape
        so that the first line is always the shape.
    truncate_vectors (int): If >= 1, the array is allocated with this many
        rows and reading stops after this many lines.
    RETURNS: A tuple of (vectors array, list of words).
    RAISES: ValueError if a line does not split into a word plus exactly as
        many values as the array is wide.
    """
    rows = ensure_shape(vectors_loc)
    header = next(rows)
    dims = tuple(int(size) for size in header.split())
    if truncate_vectors >= 1:
        dims = (truncate_vectors, dims[1])
    table = numpy.zeros(shape=dims, dtype="f")
    words = []
    last_index = truncate_vectors - 1
    for row_index, raw_line in enumerate(tqdm.tqdm(rows)):
        # Split from the right so that a word containing spaces stays intact
        word, *values = raw_line.rstrip().rsplit(" ", table.shape[1])
        if len(values) != table.shape[1]:
            raise ValueError(Errors.E094.format(line_num=row_index, loc=vectors_loc))
        table[row_index] = numpy.asarray(values, dtype="f")
        words.append(word)
        if row_index == last_index:
            break
    return table, words


def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files.

    loc (Union[str, Path]): Location of the file to read.
    RETURNS: An iterator over the decoded (str) lines of the file. For tar
        and zip archives, the lines of the first contained file are returned.
    RAISES: ValueError if a tar archive contains no regular file.
    """
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        # Returning the TarFile itself would hand callers an object that is
        # not an iterator and that yields archive members instead of lines,
        # so read the lines of the contained file instead.
        return _iter_tar_lines(loc)
    elif loc.parts[-1].endswith("gz"):
        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
    elif loc.parts[-1].endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        names = zip_file.namelist()
        file_ = zip_file.open(names[0])
        return (line.decode("utf8") for line in file_)
    else:
        return loc.open("r", encoding="utf8")


def _iter_tar_lines(loc: Path):
    """Yield the utf8-decoded lines of the first regular file in a tar archive."""
    # "r:*" lets tarfile detect the compression, so that plain or otherwise
    # compressed archives accepted by is_tarfile can be read as well
    with tarfile.open(str(loc), "r:*") as archive:
        for member in archive:
            if not member.isfile():
                continue
            for line in archive.extractfile(member):
                yield line.decode("utf8")
            return
    raise ValueError(f"No regular file found in tar archive: {loc}")


def ensure_shape(vectors_loc):
    """Yield the lines of a vectors file, always starting with the shape.

    If the first line of the file already consists of integers only, the
    lines are passed through unchanged. Otherwise the shape is worked out
    from the data and yielded first, followed by all lines of the file, so
    that the reader doesn't have to deal with the problem.
    """
    reader = open_file(vectors_loc)
    head = next(reader)
    try:
        for token in head.split():
            int(token)
    except ValueError:
        has_shape = False
    else:
        has_shape = True
    if has_shape:
        # All good, give the data
        yield head
        yield from reader
        return
    # Figure out the shape, make it the first value, and then give the
    # rest of the data. The first token of a line is the word itself.
    n_dims = len(head.split()) - 1
    n_rows = 1 + sum(1 for _ in reader)
    yield f"{n_rows} {n_dims}"
    # Reading the lines in again from file. This to avoid having to
    # store all the results in a list in memory
    yield from open_file(vectors_loc)
Add option to replace listeners for sourced components 2021-01-29 07:57:04 +03:00			`from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING`
Refactor CLI 2020-09-28 16:09:59 +03:00			`from thinc.api import Config, fix_random_seed, set_gpu_allocator`
			`from thinc.api import ConfigValidationError`
			`from pathlib import Path`
			`import srsly`
Add init vectors 2020-09-29 11:58:50 +03:00			`import numpy`
			`import tarfile`
			`import gzip`
			`import zipfile`
			`import tqdm`
Refactor CLI 2020-09-28 16:09:59 +03:00
Fixing pretrain (#7342) * initialize NLP with train corpus * add more pretraining tests * more tests * function to fetch tok2vec layer for pretraining * clarify parameter name * test different objectives * formatting * fix check for static vectors when using vectors objective * clarify docs * logger statement * fix init_tok2vec and proc.initialize order * test training after pretraining * add init_config tests for pretraining * pop pretraining block to avoid config validation errors * custom errors 2021-03-09 06:01:13 +03:00			`from .pretrain import get_tok2vec_ref`
Refactor CLI 2020-09-28 16:09:59 +03:00			`from ..lookups import Lookups`
Add init vectors 2020-09-29 11:58:50 +03:00			`from ..vectors import Vectors`
WIP: Various small training changes (#6818) * Allow output_path to be None during training * Fix cat scoring (?) * Improve error message for weighted None score * Improve messages So we can call this in other places etc. * FIx output path check * Use latest wasabi * Revert "Improve error message for weighted None score" This reverts commit 70599267635e2cfcc6c8922e3e4fb20dc978beb6. * Exclude None scores from final score by default It's otherwise very difficult to keep track of the score weights if we modify a config programmatically, source components etc. * Update warnings and use logger.warning 2021-01-26 06:51:52 +03:00			`from ..errors import Errors, Warnings`
Validate seed and gpu_allocator manually 2021-01-14 18:57:57 +03:00			`from ..schemas import ConfigSchemaTraining`
Fix logging 2020-09-29 17:08:39 +03:00			`from ..util import registry, load_model_from_config, resolve_dot_names, logger`
Move replacement logic to Language.from_config 2021-01-29 11:37:04 +03:00			`from ..util import load_model, ensure_path, get_sourced_components`
			`from ..util import OOV_RANK, DEFAULT_OOV_PROB`
Refactor CLI 2020-09-28 16:09:59 +03:00
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`if TYPE_CHECKING:`
			`from ..language import Language # noqa: F401`
Refactor CLI 2020-09-28 16:09:59 +03:00
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00
Fix logging 2020-09-29 17:08:39 +03:00			`def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":`
Refactor CLI 2020-09-28 16:09:59 +03:00			`raw_config = config`
			`config = raw_config.interpolate()`
Validate seed and gpu_allocator manually 2021-01-14 18:57:57 +03:00			`if "seed" not in config["training"]:`
			`raise ValueError(Errors.E1015.format(value="[training] seed"))`
			`if "gpu_allocator" not in config["training"]:`
			`raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))`
Refactor CLI 2020-09-28 16:09:59 +03:00			`if config["training"]["seed"] is not None:`
			`fix_random_seed(config["training"]["seed"])`
			`allocator = config["training"]["gpu_allocator"]`
			`if use_gpu >= 0 and allocator:`
			`set_gpu_allocator(allocator)`
			`# Use original config here before it's resolved to functions`
Add option to replace listeners for sourced components 2021-01-29 07:57:04 +03:00			`sourced = get_sourced_components(config)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`nlp = load_model_from_config(raw_config, auto_fill=True)`
Fix logging 2020-09-29 17:08:39 +03:00			`logger.info("Set up nlp object from config")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`config = nlp.config.interpolate()`
			`# Resolve all training-relevant sections using the filled nlp config`
			`T = registry.resolve(config["training"], schema=ConfigSchemaTraining)`
			`dot_names = [T["train_corpus"], T["dev_corpus"]]`
TextCat updates and fixes (#6263) * small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos 2020-10-18 15:50:41 +03:00			`if not isinstance(T["train_corpus"], str):`
Tidy up and auto-format 2021-01-05 05:41:53 +03:00			`raise ConfigValidationError(`
			`desc=Errors.E897.format(`
			`field="training.train_corpus", type=type(T["train_corpus"])`
			`)`
			`)`
TextCat updates and fixes (#6263) * small fix in example imports * throw error when train_corpus or dev_corpus is not a string * small fix in custom logger example * limit macro_auc to labels with 2 annotations * fix typo * also create parents of output_dir if need be * update documentation of textcat scores * refactor TextCatEnsemble * fix tests for new AUC definition * bump to 3.0.0a42 * update docs * rename to spacy.TextCatEnsemble.v2 * spacy.TextCatEnsemble.v1 in legacy * cleanup * small fix * update to 3.0.0rc2 * fix import that got lost in merge * cursed IDE * fix two typos 2020-10-18 15:50:41 +03:00			`if not isinstance(T["dev_corpus"], str):`
Tidy up and auto-format 2021-01-05 05:41:53 +03:00			`raise ConfigValidationError(`
			`desc=Errors.E897.format(`
			`field="training.dev_corpus", type=type(T["dev_corpus"])`
			`)`
			`)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`train_corpus, dev_corpus = resolve_dot_names(config, dot_names)`
			`optimizer = T["optimizer"]`
			`# Components that shouldn't be updated during training`
			`frozen_components = T["frozen_components"]`
			`# Sourced components that require resume_training`
Add option to replace listeners for sourced components 2021-01-29 07:57:04 +03:00			`resume_components = [p for p in sourced if p not in frozen_components]`
Fix logging 2020-09-29 17:08:39 +03:00			`logger.info(f"Pipeline: {nlp.pipe_names}")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`if resume_components:`
			`with nlp.select_pipes(enable=resume_components):`
Fix logging 2020-09-29 17:08:39 +03:00			`logger.info(f"Resuming training for: {resume_components}")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`nlp.resume_training(sgd=optimizer)`
Fix linking resumed components (#6859) * link components across enabled, resumed and frozen * revert renaming * revert renaming, the sequel 2021-02-01 14:19:58 +03:00			`# Make sure that listeners are defined before initializing further`
			`nlp._link_components()`
Refactor CLI 2020-09-28 16:09:59 +03:00			`with nlp.select_pipes(disable=[frozen_components, resume_components]):`
remove link_components flag again (#6883) 2021-02-02 05:08:40 +03:00			`nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)`
add initialize method for entity_ruler 2020-10-05 15:59:13 +03:00			`logger.info(f"Initialized pipeline components: {nlp.pipe_names}")`
warn when frozen components break listener pattern (#6766) * warn when frozen components break listener pattern * few notes in the documentation * update arg name * formatting * cleanup * specify listeners return type 2021-01-20 03:12:35 +03:00			`# Detect components with listeners that are not frozen consistently`
			`for name, proc in nlp.pipeline:`
Add option to replace listeners for sourced components 2021-01-29 07:57:04 +03:00			`if getattr(proc, "listening_components", None): # e.g. tok2vec/transformer`
warn when frozen components break listener pattern (#6766) * warn when frozen components break listener pattern * few notes in the documentation * update arg name * formatting * cleanup * specify listeners return type 2021-01-20 03:12:35 +03:00			`for listener in proc.listening_components:`
Don't warn about components not in the pipeline See here: https://github.com/explosion/spaCy/discussions/7463 Still need to check if there are any side effects of listeners being present but not in the pipeline, but this commit will silence the warnings. 2021-03-17 08:56:04 +03:00			`# Don't warn about components not in the pipeline`
			`if listener not in nlp.pipeline:`
			`continue`

Move replacement logic to Language.from_config 2021-01-29 11:37:04 +03:00			`if listener in frozen_components and name not in frozen_components:`
WIP: Various small training changes (#6818) * Allow output_path to be None during training * Fix cat scoring (?) * Improve error message for weighted None score * Improve messages So we can call this in other places etc. * FIx output path check * Use latest wasabi * Revert "Improve error message for weighted None score" This reverts commit 70599267635e2cfcc6c8922e3e4fb20dc978beb6. * Exclude None scores from final score by default It's otherwise very difficult to keep track of the score weights if we modify a config programmatically, source components etc. * Update warnings and use logger.warning 2021-01-26 06:51:52 +03:00			`logger.warning(Warnings.W087.format(name=name, listener=listener))`
Add option to replace listeners for sourced components 2021-01-29 07:57:04 +03:00			`# We always check this regardless, in case user freezes tok2vec`
warn when frozen components break listener pattern (#6766) * warn when frozen components break listener pattern * few notes in the documentation * update arg name * formatting * cleanup * specify listeners return type 2021-01-20 03:12:35 +03:00			`if listener not in frozen_components and name in frozen_components:`
WIP: Various small training changes (#6818) * Allow output_path to be None during training * Fix cat scoring (?) * Improve error message for weighted None score * Improve messages So we can call this in other places etc. * FIx output path check * Use latest wasabi * Revert "Improve error message for weighted None score" This reverts commit 70599267635e2cfcc6c8922e3e4fb20dc978beb6. * Exclude None scores from final score by default It's otherwise very difficult to keep track of the score weights if we modify a config programmatically, source components etc. * Update warnings and use logger.warning 2021-01-26 06:51:52 +03:00			`logger.warning(Warnings.W086.format(name=name, listener=listener))`
Refactor CLI 2020-09-28 16:09:59 +03:00			`return nlp`


			`def init_vocab(`
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`nlp: "Language",`
Refactor CLI 2020-09-28 16:09:59 +03:00			`*,`
			`data: Optional[Path] = None,`
			`lookups: Optional[Lookups] = None,`
			`vectors: Optional[str] = None,`
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`) -> "Language":`
Refactor CLI 2020-09-28 16:09:59 +03:00			`if lookups:`
			`nlp.vocab.lookups = lookups`
Fix logging 2020-09-29 17:08:39 +03:00			`logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`data_path = ensure_path(data)`
			`if data_path is not None:`
			`lex_attrs = srsly.read_jsonl(data_path)`
			`for lexeme in nlp.vocab:`
			`lexeme.rank = OOV_RANK`
			`for attrs in lex_attrs:`
			`if "settings" in attrs:`
			`continue`
			`lexeme = nlp.vocab[attrs["orth"]]`
			`lexeme.set_attrs(**attrs)`
			`if len(nlp.vocab):`
			`oov_prob = min(lex.prob for lex in nlp.vocab) - 1`
			`else:`
			`oov_prob = DEFAULT_OOV_PROB`
			`nlp.vocab.cfg.update({"oov_prob": oov_prob})`
Fix logging 2020-09-29 17:22:41 +03:00			`logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")`
			`logger.info("Created vocabulary")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`if vectors is not None:`
			`load_vectors_into_model(nlp, vectors)`
Fix logging 2020-09-29 17:22:41 +03:00			`logger.info(f"Added vectors: {vectors}")`
Add logging 2020-09-29 23:53:14 +03:00			`logger.info("Finished initializing nlp object")`
Refactor CLI 2020-09-28 16:09:59 +03:00

			`def load_vectors_into_model(`
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`nlp: "Language", name: Union[str, Path], *, add_strings: bool = True`
Refactor CLI 2020-09-28 16:09:59 +03:00			`) -> None:`
			`"""Load word vectors from an installed model or path into a model instance."""`
			`try:`
			`vectors_nlp = load_model(name)`
			`except ConfigValidationError as e:`
			`title = f"Config validation error for vectors {name}"`
			`desc = (`
			`"This typically means that there's a problem in the config.cfg included "`
			`"with the packaged vectors. Make sure that the vectors package you're "`
			`"loading is compatible with the current version of spaCy."`
			`)`
Add specific error when StaticVectors can't read the vectors data (#6450) 2020-12-09 01:16:07 +03:00			`err = ConfigValidationError.from_error(e, title=title, desc=desc)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`raise err from None`
			`nlp.vocab.vectors = vectors_nlp.vocab.vectors`
			`if add_strings:`
			`# I guess we should add the strings from the vectors_nlp model?`
			`# E.g. if someone does a similarity query, they might expect the strings.`
			`for key in nlp.vocab.vectors.key2row:`
			`if key in vectors_nlp.vocab.strings:`
			`nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])`


Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`def init_tok2vec(`
Tighten up format 2020-09-29 17:47:55 +03:00			`nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]`
Refactor CLI 2020-09-28 16:09:59 +03:00			`) -> bool:`
			`# Load pretrained tok2vec weights - cf. CLI command 'pretrain'`
			`P = pretrain_config`
Tighten up format 2020-09-29 17:47:55 +03:00			`I = init_config`
Refactor CLI 2020-09-28 16:09:59 +03:00			`weights_data = None`
Tighten up format 2020-09-29 17:47:55 +03:00			`init_tok2vec = ensure_path(I["init_tok2vec"])`
Refactor CLI 2020-09-28 16:09:59 +03:00			`if init_tok2vec is not None:`
			`if not init_tok2vec.exists():`
			`err = f"can't find pretrained tok2vec: {init_tok2vec}"`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]`
Refactor CLI 2020-09-28 16:09:59 +03:00			`raise ConfigValidationError(config=nlp.config, errors=errors)`
			`with init_tok2vec.open("rb") as file_:`
			`weights_data = file_.read()`
			`if weights_data is not None:`
Fixing pretrain (#7342) * initialize NLP with train corpus * add more pretraining tests * more tests * function to fetch tok2vec layer for pretraining * clarify parameter name * test different objectives * formatting * fix check for static vectors when using vectors objective * clarify docs * logger statement * fix init_tok2vec and proc.initialize order * test training after pretraining * add init_config tests for pretraining * pop pretraining block to avoid config validation errors * custom errors 2021-03-09 06:01:13 +03:00			`layer = get_tok2vec_ref(nlp, P)`
Refactor CLI 2020-09-28 16:09:59 +03:00			`layer.from_bytes(weights_data)`
Fixing pretrain (#7342) * initialize NLP with train corpus * add more pretraining tests * more tests * function to fetch tok2vec layer for pretraining * clarify parameter name * test different objectives * formatting * fix check for static vectors when using vectors objective * clarify docs * logger statement * fix init_tok2vec and proc.initialize order * test training after pretraining * add init_config tests for pretraining * pop pretraining block to avoid config validation errors * custom errors 2021-03-09 06:01:13 +03:00			`logger.info(f"Loaded pretrained weights from {init_tok2vec}")`
Refactor CLI 2020-09-28 16:09:59 +03:00			`return True`
			`return False`


Add init vectors 2020-09-29 11:58:50 +03:00			`def convert_vectors(`
Simplify config use in Language.initialize 2020-09-29 17:05:48 +03:00			`nlp: "Language",`
Add init vectors 2020-09-29 11:58:50 +03:00			`vectors_loc: Optional[Path],`
			`*,`
			`truncate: int,`
			`prune: int,`
			`name: Optional[str] = None,`
			`) -> None:`
			`vectors_loc = ensure_path(vectors_loc)`
			`if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):`
			`nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))`
			`for lex in nlp.vocab:`
			`if lex.rank and lex.rank != OOV_RANK:`
			`nlp.vocab.vectors.add(lex.orth, row=lex.rank)`
			`else:`
			`if vectors_loc:`
Fix logging 2020-09-29 17:08:39 +03:00			`logger.info(f"Reading vectors from {vectors_loc}")`
			`vectors_data, vector_keys = read_vectors(vectors_loc, truncate)`
			`logger.info(f"Loaded vectors from {vectors_loc}")`
Add init vectors 2020-09-29 11:58:50 +03:00			`else:`
			`vectors_data, vector_keys = (None, None)`
			`if vector_keys is not None:`
			`for word in vector_keys:`
			`if word not in nlp.vocab:`
			`nlp.vocab[word]`
			`if vectors_data is not None:`
			`nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)`
			`if name is None:`
			`# TODO: Is this correct? Does this matter?`
			`nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"`
			`else:`
			`nlp.vocab.vectors.name = name`
			`nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name`
			`if prune >= 1:`
			`nlp.vocab.prune_vectors(prune)`


			`def read_vectors(vectors_loc: Path, truncate_vectors: int):`
reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix 2021-02-07 03:05:43 +03:00			`f = ensure_shape(vectors_loc)`
Add init vectors 2020-09-29 11:58:50 +03:00			`shape = tuple(int(size) for size in next(f).split())`
			`if truncate_vectors >= 1:`
			`shape = (truncate_vectors, shape[1])`
			`vectors_data = numpy.zeros(shape=shape, dtype="f")`
			`vectors_keys = []`
			`for i, line in enumerate(tqdm.tqdm(f)):`
			`line = line.rstrip()`
			`pieces = line.rsplit(" ", vectors_data.shape[1])`
			`word = pieces.pop(0)`
			`if len(pieces) != vectors_data.shape[1]:`
			`raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))`
			`vectors_data[i] = numpy.asarray(pieces, dtype="f")`
			`vectors_keys.append(word)`
			`if i == truncate_vectors - 1:`
			`break`
			`return vectors_data, vectors_keys`


			`def open_file(loc: Union[str, Path]) -> IO:`
			`"""Handle .gz, .tar.gz or unzipped files"""`
			`loc = ensure_path(loc)`
			`if tarfile.is_tarfile(str(loc)):`
			`return tarfile.open(str(loc), "r:gz")`
			`elif loc.parts[-1].endswith("gz"):`
			`return (line.decode("utf8") for line in gzip.open(str(loc), "r"))`
			`elif loc.parts[-1].endswith("zip"):`
			`zip_file = zipfile.ZipFile(str(loc))`
			`names = zip_file.namelist()`
			`file_ = zip_file.open(names[0])`
			`return (line.decode("utf8") for line in file_)`
			`else:`
			`return loc.open("r", encoding="utf8")`


reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix 2021-02-07 03:05:43 +03:00			`def ensure_shape(vectors_loc):`
Add init vectors 2020-09-29 11:58:50 +03:00			`"""Ensure that the first line of the data is the vectors shape.`
			`If it's not, we read in the data and output the shape as the first result,`
			`so that the reader doesn't have to deal with the problem.`
			`"""`
reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix 2021-02-07 03:05:43 +03:00			`lines = open_file(vectors_loc)`
Add init vectors 2020-09-29 11:58:50 +03:00			`first_line = next(lines)`
			`try:`
			`shape = tuple(int(size) for size in first_line.split())`
			`except ValueError:`
			`shape = None`
			`if shape is not None:`
			`# All good, give the data`
			`yield first_line`
			`yield from lines`
			`else:`
			`# Figure out the shape, make it the first value, and then give the`
			`# rest of the data.`
			`width = len(first_line.split()) - 1`
reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix 2021-02-07 03:05:43 +03:00			`length = 1`
			`for _ in lines:`
			`length += 1`
Add init vectors 2020-09-29 11:58:50 +03:00			`yield f"{length} {width}"`
reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix 2021-02-07 03:05:43 +03:00			`# Reading the lines in again from file. This to avoid having to`
			`# store all the results in a list in memory`
			`lines2 = open_file(vectors_loc)`
			`yield from lines2`