mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			392 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			392 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import print_function, unicode_literals
 | |
| 
 | |
| import plac
 | |
| import random
 | |
| import numpy
 | |
| import time
 | |
| import re
 | |
| from collections import Counter
 | |
| from pathlib import Path
 | |
| from thinc.v2v import Affine, Maxout
 | |
| from thinc.misc import LayerNorm as LN
 | |
| from thinc.neural.util import prefer_gpu, get_array_module
 | |
| from wasabi import Printer
 | |
| import srsly
 | |
| 
 | |
| from ..errors import Errors
 | |
| from ..tokens import Doc
 | |
| from ..attrs import ID, HEAD
 | |
| from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
 | |
| from .._ml import masked_language_model
 | |
| from .. import util
 | |
| from .train import _load_pretrained_tok2vec
 | |
| 
 | |
| 
 | |
| @plac.annotations(
 | |
|     texts_loc=(
 | |
|         "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
 | |
|         "key 'tokens'",
 | |
|         "positional",
 | |
|         None,
 | |
|         str,
 | |
|     ),
 | |
|     vectors_model=("Name or path to spaCy model with vectors to learn from"),
 | |
|     output_dir=("Directory to write models to on each epoch", "positional", None, str),
 | |
|     width=("Width of CNN layers", "option", "cw", int),
 | |
|     depth=("Depth of CNN layers", "option", "cd", int),
 | |
|     embed_rows=("Number of embedding rows", "option", "er", int),
 | |
|     loss_func=(
 | |
|         "Loss function to use for the objective. Either 'L2' or 'cosine'",
 | |
|         "option",
 | |
|         "L",
 | |
|         str,
 | |
|     ),
 | |
|     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
 | |
|     dropout=("Dropout rate", "option", "d", float),
 | |
|     batch_size=("Number of words per training batch", "option", "bs", int),
 | |
|     max_length=(
 | |
|         "Max words per example. Longer examples are discarded",
 | |
|         "option",
 | |
|         "xw",
 | |
|         int,
 | |
|     ),
 | |
|     min_length=(
 | |
|         "Min words per example. Shorter examples are discarded",
 | |
|         "option",
 | |
|         "nw",
 | |
|         int,
 | |
|     ),
 | |
|     seed=("Seed for random number generators", "option", "s", int),
 | |
|     n_iter=("Number of iterations to pretrain", "option", "i", int),
 | |
|     n_save_every=("Save model every X batches.", "option", "se", int),
 | |
|     init_tok2vec=(
 | |
|         "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
 | |
|         "option",
 | |
|         "t2v",
 | |
|         Path,
 | |
|     ),
 | |
|     epoch_start=(
 | |
|         "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
 | |
|         "renamed. Prevents unintended overwriting of existing weight files.",
 | |
|         "option",
 | |
|         "es",
 | |
|         int
 | |
|     ),
 | |
| )
 | |
| def pretrain(
 | |
|     texts_loc,
 | |
|     vectors_model,
 | |
|     output_dir,
 | |
|     width=96,
 | |
|     depth=4,
 | |
|     embed_rows=2000,
 | |
|     loss_func="cosine",
 | |
|     use_vectors=False,
 | |
|     dropout=0.2,
 | |
|     n_iter=1000,
 | |
|     batch_size=3000,
 | |
|     max_length=500,
 | |
|     min_length=5,
 | |
|     seed=0,
 | |
|     n_save_every=None,
 | |
|     init_tok2vec=None,
 | |
|     epoch_start=None,
 | |
| ):
 | |
|     """
 | |
|     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
 | |
|     using an approximate language-modelling objective. Specifically, we load
 | |
|     pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
 | |
|     vectors which match the pre-trained ones. The weights are saved to a directory
 | |
|     after each epoch. You can then pass a path to one of these pre-trained weights
 | |
|     files to the 'spacy train' command.
 | |
| 
 | |
|     This technique may be especially helpful if you have little labelled data.
 | |
|     However, it's still quite experimental, so your mileage may vary.
 | |
| 
 | |
|     To load the weights back in during 'spacy train', you need to ensure
 | |
|     all settings are the same between pretraining and training. The API and
 | |
|     errors around this need some improvement.
 | |
|     """
 | |
|     config = dict(locals())
 | |
|     for key in config:
 | |
|         if isinstance(config[key], Path):
 | |
|             config[key] = str(config[key])
 | |
|     msg = Printer()
 | |
|     util.fix_random_seed(seed)
 | |
| 
 | |
|     has_gpu = prefer_gpu()
 | |
|     msg.info("Using GPU" if has_gpu else "Not using GPU")
 | |
| 
 | |
|     output_dir = Path(output_dir)
 | |
|     if not output_dir.exists():
 | |
|         output_dir.mkdir()
 | |
|         msg.good("Created output directory")
 | |
|     srsly.write_json(output_dir / "config.json", config)
 | |
|     msg.good("Saved settings to config.json")
 | |
| 
 | |
|     # Load texts from file or stdin
 | |
|     if texts_loc != "-":  # reading from a file
 | |
|         texts_loc = Path(texts_loc)
 | |
|         if not texts_loc.exists():
 | |
|             msg.fail("Input text file doesn't exist", texts_loc, exits=1)
 | |
|         with msg.loading("Loading input texts..."):
 | |
|             texts = list(srsly.read_jsonl(texts_loc))
 | |
|         if not texts:
 | |
|             msg.fail("Input file is empty", texts_loc, exits=1)
 | |
|         msg.good("Loaded input texts")
 | |
|         random.shuffle(texts)
 | |
|     else:  # reading from stdin
 | |
|         msg.text("Reading input text from stdin...")
 | |
|         texts = srsly.read_jsonl("-")
 | |
| 
 | |
|     with msg.loading("Loading model '{}'...".format(vectors_model)):
 | |
|         nlp = util.load_model(vectors_model)
 | |
|     msg.good("Loaded model '{}'".format(vectors_model))
 | |
|     pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
 | |
|     model = create_pretraining_model(
 | |
|         nlp,
 | |
|         Tok2Vec(
 | |
|             width,
 | |
|             embed_rows,
 | |
|             conv_depth=depth,
 | |
|             pretrained_vectors=pretrained_vectors,
 | |
|             bilstm_depth=0,  # Requires PyTorch. Experimental.
 | |
|             cnn_maxout_pieces=3,  # You can try setting this higher
 | |
|             subword_features=True,  # Set to False for Chinese etc
 | |
|         ),
 | |
|     )
 | |
|     # Load in pre-trained weights
 | |
|     if init_tok2vec is not None:
 | |
|         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
 | |
|         msg.text("Loaded pretrained tok2vec for: {}".format(components))
 | |
|         # Parse the epoch number from the given weight file
 | |
|         model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
 | |
|         if model_name:
 | |
|             # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
 | |
|             epoch_start = int(model_name.group(0)[5:][:-4]) + 1
 | |
|         else:
 | |
|             if not epoch_start:
 | |
|                 msg.fail(
 | |
|                     "You have to use the '--epoch-start' argument when using a renamed weight file for "
 | |
|                     "'--init-tok2vec'", exits=True
 | |
|                 )
 | |
|             elif epoch_start < 0:
 | |
|                 msg.fail(
 | |
|                     "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" % epoch_start,
 | |
|                     exits=True
 | |
|                 )
 | |
|     else:
 | |
|         # Without '--init-tok2vec' the '--epoch-start' argument is ignored
 | |
|         epoch_start = 0
 | |
| 
 | |
|     optimizer = create_default_optimizer(model.ops)
 | |
|     tracker = ProgressTracker(frequency=10000)
 | |
|     msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
 | |
|     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
 | |
|     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 | |
| 
 | |
|     def _save_model(epoch, is_temp=False):
 | |
|         is_temp_str = ".temp" if is_temp else ""
 | |
|         with model.use_params(optimizer.averages):
 | |
|             with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
 | |
|                 "wb"
 | |
|             ) as file_:
 | |
|                 file_.write(model.tok2vec.to_bytes())
 | |
|             log = {
 | |
|                 "nr_word": tracker.nr_word,
 | |
|                 "loss": tracker.loss,
 | |
|                 "epoch_loss": tracker.epoch_loss,
 | |
|                 "epoch": epoch,
 | |
|             }
 | |
|             with (output_dir / "log.jsonl").open("a") as file_:
 | |
|                 file_.write(srsly.json_dumps(log) + "\n")
 | |
| 
 | |
|     skip_counter = 0
 | |
|     for epoch in range(epoch_start, n_iter + epoch_start):
 | |
|         for batch_id, batch in enumerate(
 | |
|             util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
 | |
|         ):
 | |
|             docs, count = make_docs(
 | |
|                 nlp,
 | |
|                 [text for (text, _) in batch],
 | |
|                 max_length=max_length,
 | |
|                 min_length=min_length,
 | |
|             )
 | |
|             skip_counter += count
 | |
|             loss = make_update(
 | |
|                 model, docs, optimizer, objective=loss_func, drop=dropout
 | |
|             )
 | |
|             progress = tracker.update(epoch, loss, docs)
 | |
|             if progress:
 | |
|                 msg.row(progress, **row_settings)
 | |
|                 if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
 | |
|                     break
 | |
|             if n_save_every and (batch_id % n_save_every == 0):
 | |
|                 _save_model(epoch, is_temp=True)
 | |
|         _save_model(epoch)
 | |
|         tracker.epoch_loss = 0.0
 | |
|         if texts_loc != "-":
 | |
|             # Reshuffle the texts if texts were loaded from a file
 | |
|             random.shuffle(texts)
 | |
|     if skip_counter > 0:
 | |
|         msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
 | |
|     msg.good("Successfully finished pretrain")
 | |
| 
 | |
| 
 | |
| def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
 | |
|     """Perform an update over a single batch of documents.
 | |
| 
 | |
|     docs (iterable): A batch of `Doc` objects.
 | |
|     drop (float): The droput rate.
 | |
|     optimizer (callable): An optimizer.
 | |
|     RETURNS loss: A float for the loss.
 | |
|     """
 | |
|     predictions, backprop = model.begin_update(docs, drop=drop)
 | |
|     loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
 | |
|     backprop(gradients, sgd=optimizer)
 | |
|     # Don't want to return a cupy object here
 | |
|     # The gradients are modified in-place by the BERT MLM,
 | |
|     # so we get an accurate loss
 | |
|     return float(loss)
 | |
| 
 | |
| 
 | |
| def make_docs(nlp, batch, min_length, max_length):
 | |
|     docs = []
 | |
|     skip_count = 0
 | |
|     for record in batch:
 | |
|         if not isinstance(record, dict):
 | |
|             raise TypeError(Errors.E137.format(type=type(record), line=record))
 | |
|         if "tokens" in record:
 | |
|             words = record["tokens"]
 | |
|             if not words:
 | |
|                 skip_count += 1
 | |
|                 continue
 | |
|             doc = Doc(nlp.vocab, words=words)
 | |
|         elif "text" in record:
 | |
|             text = record["text"]
 | |
|             if not text:
 | |
|                 skip_count += 1
 | |
|                 continue
 | |
|             doc = nlp.make_doc(text)
 | |
|         else:
 | |
|             raise ValueError(Errors.E138.format(text=record))
 | |
|         if "heads" in record:
 | |
|             heads = record["heads"]
 | |
|             heads = numpy.asarray(heads, dtype="uint64")
 | |
|             heads = heads.reshape((len(doc), 1))
 | |
|             doc = doc.from_array([HEAD], heads)
 | |
|         if len(doc) >= min_length and len(doc) < max_length:
 | |
|             docs.append(doc)
 | |
|     return docs, skip_count
 | |
| 
 | |
| 
 | |
| def get_vectors_loss(ops, docs, prediction, objective="L2"):
 | |
|     """Compute a mean-squared error loss between the documents' vectors and
 | |
|     the prediction.
 | |
| 
 | |
|     Note that this is ripe for customization! We could compute the vectors
 | |
|     in some other word, e.g. with an LSTM language model, or use some other
 | |
|     type of objective.
 | |
|     """
 | |
|     # The simplest way to implement this would be to vstack the
 | |
|     # token.vector values, but that's a bit inefficient, especially on GPU.
 | |
|     # Instead we fetch the index into the vectors table for each of our tokens,
 | |
|     # and look them up all at once. This prevents data copying.
 | |
|     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
 | |
|     target = docs[0].vocab.vectors.data[ids]
 | |
|     if objective == "L2":
 | |
|         d_target = prediction - target
 | |
|         loss = (d_target ** 2).sum()
 | |
|     elif objective == "cosine":
 | |
|         loss, d_target = get_cossim_loss(prediction, target)
 | |
|     else:
 | |
|         raise ValueError(Errors.E142.format(loss_func=objective))
 | |
|     return loss, d_target
 | |
| 
 | |
| 
 | |
| def get_cossim_loss(yh, y):
 | |
|     # Add a small constant to avoid 0 vectors
 | |
|     yh = yh + 1e-8
 | |
|     y = y + 1e-8
 | |
|     # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
 | |
|     xp = get_array_module(yh)
 | |
|     norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
 | |
|     norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
 | |
|     mul_norms = norm_yh * norm_y
 | |
|     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
 | |
|     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
 | |
|     loss = xp.abs(cosine - 1).sum()
 | |
|     return loss, -d_yh
 | |
| 
 | |
| 
 | |
| def create_pretraining_model(nlp, tok2vec):
 | |
|     """Define a network for the pretraining. We simply add an output layer onto
 | |
|     the tok2vec input model. The tok2vec input model needs to be a model that
 | |
|     takes a batch of Doc objects (as a list), and returns a list of arrays.
 | |
|     Each array in the output needs to have one row per token in the doc.
 | |
|     """
 | |
|     output_size = nlp.vocab.vectors.data.shape[1]
 | |
|     output_layer = chain(
 | |
|         LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
 | |
|     )
 | |
|     # This is annoying, but the parser etc have the flatten step after
 | |
|     # the tok2vec. To load the weights in cleanly, we need to match
 | |
|     # the shape of the models' components exactly. So what we cann
 | |
|     # "tok2vec" has to be the same set of processes as what the components do.
 | |
|     tok2vec = chain(tok2vec, flatten)
 | |
|     model = chain(tok2vec, output_layer)
 | |
|     model = masked_language_model(nlp.vocab, model)
 | |
|     model.tok2vec = tok2vec
 | |
|     model.output_layer = output_layer
 | |
|     model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
 | |
|     return model
 | |
| 
 | |
| 
 | |
| class ProgressTracker(object):
 | |
|     def __init__(self, frequency=1000000):
 | |
|         self.loss = 0.0
 | |
|         self.prev_loss = 0.0
 | |
|         self.nr_word = 0
 | |
|         self.words_per_epoch = Counter()
 | |
|         self.frequency = frequency
 | |
|         self.last_time = time.time()
 | |
|         self.last_update = 0
 | |
|         self.epoch_loss = 0.0
 | |
| 
 | |
|     def update(self, epoch, loss, docs):
 | |
|         self.loss += loss
 | |
|         self.epoch_loss += loss
 | |
|         words_in_batch = sum(len(doc) for doc in docs)
 | |
|         self.words_per_epoch[epoch] += words_in_batch
 | |
|         self.nr_word += words_in_batch
 | |
|         words_since_update = self.nr_word - self.last_update
 | |
|         if words_since_update >= self.frequency:
 | |
|             wps = words_since_update / (time.time() - self.last_time)
 | |
|             self.last_update = self.nr_word
 | |
|             self.last_time = time.time()
 | |
|             loss_per_word = self.loss - self.prev_loss
 | |
|             status = (
 | |
|                 epoch,
 | |
|                 self.nr_word,
 | |
|                 _smart_round(self.loss, width=10),
 | |
|                 _smart_round(loss_per_word, width=6),
 | |
|                 int(wps),
 | |
|             )
 | |
|             self.prev_loss = float(self.loss)
 | |
|             return status
 | |
|         else:
 | |
|             return None
 | |
| 
 | |
| 
 | |
| def _smart_round(figure, width=10, max_decimal=4):
 | |
|     """Round large numbers as integers, smaller numbers as decimals."""
 | |
|     n_digits = len(str(int(figure)))
 | |
|     n_decimal = width - (n_digits + 1)
 | |
|     if n_decimal <= 1:
 | |
|         return str(int(figure))
 | |
|     else:
 | |
|         n_decimal = min(n_decimal, max_decimal)
 | |
|         format_str = "%." + str(n_decimal) + "f"
 | |
|         return format_str % figure
 |