# Mirror of https://github.com/explosion/spaCy.git

from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from ...vocab import Vocab  # noqa: F401
    from ...tokens import Doc  # noqa: F401


def build_multi_task_model(
    tok2vec: Model,
    maxout_pieces: int,
    token_vector_width: int,
    nO: Optional[int] = None,
) -> Model:
    softmax = Softmax(nO=nO, nI=token_vector_width * 2)
    model = chain(
        tok2vec,
        Maxout(
            nO=token_vector_width * 2,
            nI=token_vector_width,
            nP=maxout_pieces,
            dropout=0.0,
        ),
        LayerNorm(token_vector_width * 2),
        softmax,
    )
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("output_layer", softmax)
    return model
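
# Shape walkthrough for build_multi_task_model (descriptive note, not part of
# the original file): tok2vec yields one vector of width `token_vector_width`
# per token; the Maxout layer widens it to twice that size using
# `maxout_pieces` pieces; LayerNorm normalizes the widened vectors; and the
# final Softmax maps them to `nO` output classes. The "tok2vec" and
# "output_layer" refs can be fetched later with `model.get_ref(...)`.
#
# A minimal, hypothetical call sketch (the tok2vec layer and all numbers are
# assumptions, not taken from this file):
#
#     model = build_multi_task_model(
#         tok2vec,                # any thinc Model producing width-96 token vectors
#         maxout_pieces=3,
#         token_vector_width=96,
#         nO=50,                  # e.g. the number of tag labels
#     )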


def build_cloze_multi_task_model(
    vocab: "Vocab",
    tok2vec: Model,
    maxout_pieces: int,
    hidden_size: int,
    nO: Optional[int] = None,
) -> Model:
    # nO = vocab.vectors.data.shape[1]
    output_layer = chain(
        list2array(),
        Maxout(
            nO=nO,
            nI=tok2vec.get_dim("nO"),
            nP=maxout_pieces,
            normalize=True,
            dropout=0.0,
        ),
        Linear(nO=nO, nI=nO, init_W=zero_init),
    )
    model = chain(tok2vec, output_layer)
    model = build_masked_language_model(vocab, model)
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("output_layer", output_layer)
    return model
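
# Note on build_cloze_multi_task_model (descriptive, not part of the original
# file): list2array concatenates the per-Doc token vectors into one array, the
# normalized Maxout projects them to width `nO`, and the zero-initialized
# Linear layer produces the final prediction, so this cloze objective predicts
# a static vector for each masked token. The commented-out
# `nO = vocab.vectors.data.shape[1]` suggests `nO` is typically the width of
# the vocab's pretrained vectors. The whole chain is then wrapped by
# build_masked_language_model below.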


def build_cloze_characters_multi_task_model(
    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
) -> Model:
    output_layer = chain(
        list2array(),
        Maxout(hidden_size, nP=maxout_pieces),
        LayerNorm(nI=hidden_size),
        MultiSoftmax([256] * nr_char, nI=hidden_size),
    )
    model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("output_layer", output_layer)
    return model
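
# Note on build_cloze_characters_multi_task_model (descriptive, not part of
# the original file): instead of predicting a vector, this variant predicts
# characters. MultiSoftmax produces `nr_char` independent softmax
# distributions, each over 256 classes (one per possible byte value), so the
# objective is to reconstruct a fixed number of bytes of each masked word's
# text from its contextual representation.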


def build_masked_language_model(
    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
    """Convert a model into a BERT-style masked language model"""
    random_words = _RandomWords(vocab)

    def mlm_forward(model, docs, is_train):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, backprop = model.layers[0](docs, is_train)

        def mlm_backward(d_output):
            d_output *= 1 - mask
            return backprop(d_output)

        return output, mlm_backward

    def mlm_initialize(model: Model, X=None, Y=None):
        wrapped = model.layers[0]
        wrapped.initialize(X=X, Y=Y)
        for dim in wrapped.dim_names:
            if wrapped.has_dim(dim):
                model.set_dim(dim, wrapped.get_dim(dim))

    mlm_model = Model(
        "masked-language-model",
        mlm_forward,
        layers=[wrapped_model],
        init=mlm_initialize,
        refs={"wrapped": wrapped_model},
        dims={dim: None for dim in wrapped_model.dim_names},
    )
    mlm_model.set_ref("wrapped", wrapped_model)
    return mlm_model
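
# How the masking wrapper works (descriptive note, not part of the original
# file): mlm_forward corrupts the incoming Docs with _apply_mask and runs the
# wrapped model on the corrupted copies. In the returned mask, True means a
# token was left untouched, so `d_output *= 1 - mask` zeroes the gradient at
# untouched positions and the wrapped model only learns from tokens that were
# actually masked, as in BERT-style pretraining. mlm_initialize forwards the
# wrapped model's dimensions to the wrapper so callers can still query them.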


class _RandomWords:
    def __init__(self, vocab: "Vocab") -> None:
        self.words = [lex.text for lex in vocab if lex.prob != 0.0]
        self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
        self.words = self.words[:10000]
        self.probs = self.probs[:10000]
        self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
        self.probs /= self.probs.sum()
        self._cache = []

    def next(self) -> str:
        if not self._cache:
            self._cache.extend(
                numpy.random.choice(len(self.words), 10000, p=self.probs)
            )
        index = self._cache.pop()
        return self.words[index]
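
# A minimal usage sketch for _RandomWords (illustrative, not part of the
# original file). The class keeps the first 10,000 vocabulary entries that
# carry a probability, exponentiates their log probabilities into a normalized
# distribution, and samples indices in cached batches of 10,000. The sketch
# assumes `vocab` is a populated spacy Vocab whose lexemes have probability
# estimates; with an empty vocab there is nothing to sample:
#
#     random_words = _RandomWords(vocab)
#     replacement = random_words.next()  # a word drawn by unigram probability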


def _apply_mask(
    docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
) -> Tuple[numpy.ndarray, List["Doc"]]:
    # This needs to be here to avoid circular imports
    from ...tokens import Doc  # noqa: F811

    N = sum(len(doc) for doc in docs)
    mask = numpy.random.uniform(0.0, 1.0, (N,))
    mask = mask >= mask_prob
    i = 0
    masked_docs = []
    for doc in docs:
        words = []
        for token in doc:
            if not mask[i]:
                word = _replace_word(token.text, random_words)
            else:
                word = token.text
            words.append(word)
            i += 1
        spaces = [bool(w.whitespace_) for w in doc]
        # NB: If you change this implementation to instead modify
        # the docs in place, take care that the IDs reflect the original
        # words. Currently we use the original docs to make the vectors
        # for the target, so we don't lose the original tokens. But if
        # you modified the docs in place here, you would.
        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
    return mask, masked_docs
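
# Note on _apply_mask (descriptive, not part of the original file): it draws
# one uniform sample per token across the whole batch, keeps tokens whose
# sample is >= mask_prob, and rebuilds each Doc from the (partially replaced)
# word strings so that the original Docs stay intact for computing the
# targets. The returned boolean mask is True at positions left unchanged.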


def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
    roll = numpy.random.random()
    if roll < 0.8:
        return mask
    elif roll < 0.9:
        return random_words.next()
    else:
        return word
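
# The 80/10/10 split above mirrors the BERT masking recipe: of the tokens
# selected for corruption, roughly 80% become the "[MASK]" string, 10% are
# swapped for a random vocabulary word, and 10% are left as-is.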