Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00
			
		
		
		
* bring back default build_text_classifier method
* remove _set_dims_ hack in favor of proper dim inference
* add tok2vec initialize to unit test
* small fixes
* add unit test for various textcat config settings
* logistic output layer does not have nO
* fix window_size setting
* proper fix
* fix W initialization
* Update textcat training example
* Use ml_datasets
* Convert training data to `Example` format
* Use `n_texts` to set proportionate dev size
* fix _init renaming on latest thinc
* avoid setting a non-existing dim
* update to thinc==8.0.0a2
* add BOW and CNN defaults for easy testing
* various experiments with train_textcat script, fix softmax activation in textcat bow
* allow textcat train script to work on other datasets as well
* have dataset as a parameter
* train textcat from config, with example config
* add config for training textcat
* formatting
* fix exclusive_classes
* fixing BOW for GPU
* bump thinc to 8.0.0a3 (not published yet so CI will fail)
* add in link_vectors_to_models which got deleted

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
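The bullet about converting the training data to `Example` format refers to spaCy v3's training-data container. A minimal sketch of what that conversion can look like in a textcat training script, assuming the released v3 import path (`spacy.training.Example`; earlier dev builds kept it in `spacy.gold`) and a hypothetical list of (text, label) pairs:

import spacy
from spacy.training import Example

nlp = spacy.blank("en")

# Hypothetical raw data: (text, label) pairs, e.g. loaded via ml_datasets.
raw_train_data = [
    ("This movie was great!", "POSITIVE"),
    ("Utterly boring and far too long.", "NEGATIVE"),
]

train_examples = []
for text, label in raw_train_data:
    # Reference annotations: one score per category, 1.0 for the gold label.
    cats = {"POSITIVE": float(label == "POSITIVE"), "NEGATIVE": float(label == "NEGATIVE")}
    # Example pairs a predicted Doc with the reference annotations.
    train_examples.append(Example.from_dict(nlp.make_doc(text), {"cats": cats}))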
		
			
				
	
	
		
37 lines · 1.2 KiB · Python
	
import numpy
from thinc.api import Model

from ..attrs import LOWER


def extract_ngrams(ngram_size, attr=LOWER) -> Model:
    model = Model("extract_ngrams", forward)
    model.attrs["ngram_size"] = ngram_size
    model.attrs["attr"] = attr
    return model


def forward(model, docs, is_train: bool):
    batch_keys = []
    batch_vals = []
    for doc in docs:
        unigrams = model.ops.asarray(doc.to_array([model.attrs["attr"]]))
        ngrams = [unigrams]
        for n in range(2, model.attrs["ngram_size"] + 1):
            ngrams.append(model.ops.ngrams(n, unigrams))
        keys = model.ops.xp.concatenate(ngrams)
        keys, vals = model.ops.xp.unique(keys, return_counts=True)
        batch_keys.append(keys)
        batch_vals.append(vals)
    # The dtype here matches what thinc is expecting -- which differs per
    # platform (by int definition). This should be fixed once the problem
    # is fixed on Thinc's side.
    lengths = model.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
    batch_keys = model.ops.xp.concatenate(batch_keys)
    batch_vals = model.ops.asarray(model.ops.xp.concatenate(batch_vals), dtype="f")

    def backprop(dY):
        return []

    return (batch_keys, batch_vals, lengths), backprop
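For context, extract_ngrams is the feature extractor used by the bag-of-words textcat models the commit above touches. A rough sketch of how it can be composed into a standalone classifier, assuming only thinc 8's generic chain, SparseLinear and Logistic layers (build_simple_bow_textcat is a hypothetical helper name, not the PR's build_text_classifier):

from thinc.api import chain, SparseLinear, Logistic


def build_simple_bow_textcat(nO: int, ngram_size: int = 1) -> Model:
    # extract_ngrams produces (keys, values, lengths), which is the sparse
    # input format SparseLinear consumes; Logistic maps the class scores to
    # independent per-label probabilities (non-exclusive classes).
    return chain(extract_ngrams(ngram_size, attr=LOWER), SparseLinear(nO=nO), Logistic())

After model.initialize(), calling such a model on a batch of Doc objects should yield an array of shape (number of docs, nO), one row of label probabilities per document.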