mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	* Support infinite generators for training corpora Support a training corpus with an infinite generator in the `spacy train` training loop: * Revert `create_train_batches` to the state where an infinite generator can be used in the first epoch of exactly one epoch without resulting in a memory leak (`max_epochs != 1` will still result in a memory leak) * Move the shuffling for the first epoch into the corpus reader, renaming it to `spacy.Corpus.v2`. * Switch to training option for shuffling in memory Training loop: * Add option `training.shuffle_train_corpus_in_memory` that controls whether the corpus is loaded in memory once and shuffled in the training loop * Revert changes to `create_train_batches` and rename to `create_train_batches_with_shuffling` for use with `spacy.Corpus.v1` and a corpus that should be loaded in memory * Add `create_train_batches_without_shuffling` for a corpus that should not be shuffled in the training loop: the corpus is merely batched during training Corpus readers: * Restore `spacy.Corpus.v1` * Add `spacy.ShuffledCorpus.v1` for a corpus shuffled in memory in the reader instead of the training loop * In combination with `shuffle_train_corpus_in_memory = False`, each epoch could result in a different augmentation * Refactor create_train_batches, validation * Rename config setting to `training.shuffle_train_corpus` * Refactor to use a single `create_train_batches` method with a `shuffle` option * Only validate `get_examples` in initialize step if: * labels are required * labels are not provided * Switch back to max_epochs=-1 for streaming train corpus * Use first 100 examples for stream train corpus init * Always check validate_get_examples in initialize
		
			
				
	
	
		
			277 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			277 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
 | |
| from thinc.api import Config, fix_random_seed, set_gpu_allocator
 | |
| from thinc.api import ConfigValidationError
 | |
| from pathlib import Path
 | |
| import srsly
 | |
| import numpy
 | |
| import tarfile
 | |
| import gzip
 | |
| import zipfile
 | |
| import tqdm
 | |
| from itertools import islice
 | |
| 
 | |
| from .pretrain import get_tok2vec_ref
 | |
| from ..lookups import Lookups
 | |
| from ..vectors import Vectors
 | |
| from ..errors import Errors, Warnings
 | |
| from ..schemas import ConfigSchemaTraining
 | |
| from ..util import registry, load_model_from_config, resolve_dot_names, logger
 | |
| from ..util import load_model, ensure_path, get_sourced_components
 | |
| from ..util import OOV_RANK, DEFAULT_OOV_PROB
 | |
| 
 | |
| if TYPE_CHECKING:
 | |
|     from ..language import Language  # noqa: F401
 | |
| 
 | |
| 
 | |
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Create an nlp object from a training config and initialize its pipeline.

    config (Config): The training config. The uninterpolated config is used to
        build the pipeline; interpolated copies are used to read [training].
    use_gpu (int): The configured GPU allocator is only set if this is >= 0.
    RETURNS (Language): The initialized nlp object.
    """
    raw_config = config
    interpolated = raw_config.interpolate()
    settings = interpolated["training"]
    if "seed" not in settings:
        raise ValueError(Errors.E1015.format(value="[training] seed"))
    if "gpu_allocator" not in settings:
        raise ValueError(Errors.E1015.format(value="[training] gpu_allocator"))
    seed = settings["seed"]
    if seed is not None:
        fix_random_seed(seed)
    allocator = settings["gpu_allocator"]
    if allocator and use_gpu >= 0:
        set_gpu_allocator(allocator)
    # Collect the sourced components before the config is resolved to functions
    sourced = get_sourced_components(interpolated)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    filled = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(filled["training"], schema=ConfigSchemaTraining)
    # Both corpora have to be given as dot names (strings) into the config
    corpus_settings = (
        ("training.train_corpus", T["train_corpus"]),
        ("training.dev_corpus", T["dev_corpus"]),
    )
    for field, dot_name in corpus_settings:
        if not isinstance(dot_name, str):
            raise ConfigValidationError(
                desc=Errors.E897.format(field=field, type=type(dot_name))
            )
    train_corpus, dev_corpus = resolve_dot_names(
        filled, [dot_name for _, dot_name in corpus_settings]
    )
    optimizer = T["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [name for name in sourced if name not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()

    def all_examples():
        return train_corpus(nlp)

    def first_examples():
        # A streamed corpus may never end, so only look at its first 100 examples
        return islice(train_corpus(nlp), 100)

    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        if T["max_epochs"] == -1:
            logger.debug(
                "Due to streamed train corpus, using only first 100 examples "
                "for initialization. If necessary, provide all labels in "
                "[initialize]. More info: https://spacy.io/api/cli#init_labels"
            )
            get_examples = first_examples
        else:
            get_examples = all_examples
        nlp.initialize(get_examples, sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        name_frozen = name in frozen_components
        # e.g. tok2vec/transformer
        for listener in getattr(proc, "listening_components", []):
            # Don't warn about components not in the pipeline
            if listener not in nlp.pipe_names:
                continue
            listener_frozen = listener in frozen_components
            if listener_frozen and not name_frozen:
                logger.warning(Warnings.W087.format(name=name, listener=listener))
            # We always check this regardless, in case user freezes tok2vec
            if name_frozen and not listener_frozen:
                logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
 | |
| 
 | |
| 
 | |
def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> None:
    """Populate nlp.vocab in place from lookups, lexeme data and vectors.

    nlp (Language): The nlp object whose vocab is modified.
    data (Optional[Path]): Path to a JSONL file of lexeme attributes. Entries
        with a "settings" key are skipped; every other entry must provide
        "orth" and is applied to the matching lexeme.
    lookups (Optional[Lookups]): If truthy, replaces nlp.vocab.lookups.
    vectors (Optional[str]): If not None, passed to load_vectors_into_model.

    The function works purely by side effect and returns nothing, so the
    return annotation is None.
    """
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        # Reset the rank of all existing lexemes before applying the new data
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            # OOV probability is set just below the lowest known probability
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    logger.info("Finished initializing nlp object")
 | |
| 
 | |
| 
 | |
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object that receives the vectors.
    name (Union[str, Path]): Passed to load_model to load the vectors source.
    add_strings (bool): Whether to also copy the strings of the vector keys
        that the source pipeline's string store knows about.
    """
    try:
        source_nlp = load_model(name)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        raise ConfigValidationError.from_error(e, title=title, desc=desc) from None

    source_vocab = source_nlp.vocab
    if len(source_vocab.vectors.keys()) == 0:
        logger.warning(Warnings.W112.format(name=name))

    nlp.vocab.vectors = source_vocab.vectors
    if not add_strings:
        return
    # Copy over the strings for the vector keys, e.g. so that a similarity
    # query can resolve them.
    source_strings = source_vocab.strings
    for key in nlp.vocab.vectors.key2row:
        if key in source_strings:
            nlp.vocab.strings.add(source_strings[key])
 | |
| 
 | |
| 
 | |
def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    """Load pretrained tok2vec weights - cf. CLI command 'pretrain'.

    nlp (Language): The nlp object holding the layer to load the weights into.
    pretrain_config (Dict[str, Any]): Passed to get_tok2vec_ref to find the layer.
    init_config (Dict[str, Any]): Its "init_tok2vec" entry is the weights path.
    RETURNS (bool): True if weights were loaded, False if no path is set.
    RAISES (ConfigValidationError): If the configured path does not exist.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        return False
    if not weights_path.exists():
        err = f"can't find pretrained tok2vec: {weights_path}"
        errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
        raise ConfigValidationError(config=nlp.config, errors=errors)
    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    layer = get_tok2vec_ref(nlp, pretrain_config)
    layer.from_bytes(weights_data)
    logger.info(f"Loaded pretrained weights from {weights_path}")
    return True
 | |
| 
 | |
| 
 | |
def convert_vectors(
    nlp: "Language",
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
) -> None:
    """Read vectors from a file into nlp.vocab.vectors and name them.

    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Vectors file. A name ending in ".npz" is
        loaded with numpy.load, anything else is parsed by read_vectors.
    truncate (int): Passed on to read_vectors.
    prune (int): If >= 1, passed to nlp.vocab.prune_vectors.
    name (Optional[str]): Name for the vectors. Defaults to a name built from
        the "lang" and "name" entries of nlp.meta.
    """
    loc = ensure_path(vectors_loc)
    if loc and loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(loc.open("rb")))
        # Map every lexeme with a usable rank onto the vector row of that rank
        for lex in nlp.vocab:
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    elif loc:
        logger.info(f"Reading vectors from {loc}")
        vectors_data, vector_keys = read_vectors(loc, truncate)
        logger.info(f"Loaded vectors from {loc}")
        if vector_keys is not None:
            for word in vector_keys:
                if word not in nlp.vocab:
                    # The lookup is done for its side effect on the vocab
                    nlp.vocab[word]
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    # TODO: Is this correct? Does this matter?
    nlp.vocab.vectors.name = (
        name if name is not None else f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    )
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1:
        nlp.vocab.prune_vectors(prune)
 | |
| 
 | |
| 
 | |
def read_vectors(vectors_loc: Path, truncate_vectors: int):
    """Read a text vectors file into an array of vectors and a list of keys.

    vectors_loc (Path): The vectors file. Its first line (as provided by
        ensure_shape) gives the shape, every following line holds a word and
        its space-separated values.
    truncate_vectors (int): If >= 1, the array is allocated with this many
        rows and reading stops after this many lines.
    RETURNS: A tuple of the float array and the list of words.
    RAISES (ValueError): If a line does not hold the expected number of values.
    """
    f = ensure_shape(vectors_loc)
    try:
        shape = tuple(int(size) for size in next(f).split())
        if truncate_vectors >= 1:
            shape = (truncate_vectors, shape[1])
        vectors_data = numpy.zeros(shape=shape, dtype="f")
        vectors_keys = []
        for i, line in enumerate(tqdm.tqdm(f)):
            line = line.rstrip()
            # Split from the right so that the word itself may contain spaces
            pieces = line.rsplit(" ", vectors_data.shape[1])
            word = pieces.pop(0)
            if len(pieces) != vectors_data.shape[1]:
                raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
            vectors_data[i] = numpy.asarray(pieces, dtype="f")
            vectors_keys.append(word)
            if i == truncate_vectors - 1:
                break
    finally:
        # The generator is still suspended if we stopped early (truncation or
        # an error). Close it explicitly so that the file it reads from is
        # released right away instead of whenever it is garbage collected.
        f.close()
    return vectors_data, vectors_keys
 | |
| 
 | |
| 
 | |
def _iter_decoded_lines(stream, *owners):
    """Yield the UTF-8 decoded lines of a binary stream, then close it.

    The stream and any owners (e.g. the archive it was opened from) are closed
    once the lines are exhausted or the generator itself is closed.
    """
    try:
        for raw_line in stream:
            yield raw_line.decode("utf8")
    finally:
        stream.close()
        for owner in owners:
            owner.close()


def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files

    loc (Union[str, Path]): The file to open.
    RETURNS: For names ending in "gz" or "zip", a generator of decoded lines
        (for zip archives, the lines of the first member). For tar archives,
        the opened archive. Otherwise the file opened as UTF-8 text.
    """
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        # NOTE(review): this returns the archive object rather than lines of
        # text, so callers that expect an iterator of lines can't use it
        # directly - confirm whether tar archives are meant to be supported.
        return tarfile.open(str(loc), "r:gz")
    elif loc.parts[-1].endswith("gz"):
        # Open eagerly so that a missing file is reported right here
        return _iter_decoded_lines(gzip.open(str(loc), "r"))
    elif loc.parts[-1].endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        try:
            names = zip_file.namelist()
            file_ = zip_file.open(names[0])
        except Exception:
            # Don't leave the archive open if its first member can't be opened
            zip_file.close()
            raise
        return _iter_decoded_lines(file_, zip_file)
    else:
        return loc.open("r", encoding="utf8")
 | |
| 
 | |
| 
 | |
def ensure_shape(vectors_loc):
    """Ensure that the first line of the data is the vectors shape.
    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.

    vectors_loc: Location of the vectors file, passed on to open_file.
    YIELDS (str): The shape line, followed by the lines of the data.
    """

    def close(handle):
        # Whatever open_file returned is closed if it supports it
        closer = getattr(handle, "close", None)
        if closer is not None:
            closer()

    lines = open_file(vectors_loc)
    try:
        first_line = next(lines)
        try:
            shape = tuple(int(size) for size in first_line.split())
        except ValueError:
            shape = None
        if shape is not None:
            # All good, give the data
            yield first_line
            yield from lines
            return
        # Figure out the shape, make it the first value, and then give the
        # rest of the data.
        width = len(first_line.split()) - 1
        length = 1
        for _ in lines:
            length += 1
    finally:
        # Release the file once it's exhausted or the consumer stops early
        close(lines)
    yield f"{length} {width}"
    # Reading the lines in again from file. This to avoid having to
    # store all the results in a list in memory
    lines2 = open_file(vectors_loc)
    try:
        yield from lines2
    finally:
        close(lines2)
 |