mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
* OrigAnnot class instead of gold.orig_annot list of zipped tuples
* from_orig to replace from_annot_tuples
* rename to RawAnnot
* some unit tests for GoldParse creation and internal format
* removing orig_annot and switching to lists instead of tuple
* rewriting tuples to use RawAnnot (+ debug statements, WIP)
* fix pop() changing the data
* small fixes
* pop-append fixes
* return RawAnnot for existing GoldParse to have uniform interface
* clean up imports
* fix merge_sents
* add unit test for 4402 with new structure (not working yet)
* introduce DocAnnot
* typo fixes
* add unit test for merge_sents
* rename from_orig to from_raw
* fixing unit tests
* fix nn parser
* read_annots to produce text, doc_annot pairs
* _make_golds fix
* rename golds_to_gold_annots
* small fixes
* fix encoding
* have golds_to_gold_annots use DocAnnot
* missed a spot
* merge_sents as function in DocAnnot
* allow specifying only part of the token-level annotations
* refactor with Example class + underlying dicts
* pipeline components to work with Example objects (wip)
* input checking
* fix yielding
* fix calls to update
* small fixes
* fix scorer unit test with new format
* fix kwargs order
* fixes for ud and conllu scripts
* fix reading data for conllu script
* add in proper errors (not fixed numbering yet to avoid merge conflicts)
* fixing few more small bugs
* fix EL script
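For orientation, the core of the refactored interface, as it is consumed by the example file below, looks roughly like this. This is a minimal sketch assuming only the names that appear in that file (read_json_file, example.token_annotations, example.doc_annotation, GoldParse.from_annotation); "training-data.json" is a placeholder path.

import spacy
from spacy.tokens import Doc
from spacy.gold import read_json_file, GoldParse

nlp = spacy.blank("en")
for example in read_json_file("training-data.json"):
    # Each example bundles document-level and token-level annotations.
    for token_annotation in example.token_annotations:
        doc = Doc(nlp.vocab, words=token_annotation.words)
        gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
        # (doc, gold) pairs are then passed to nlp.update(examples=[...]),
        # as shown in the full script below.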
89 lines · 3.0 KiB · Python
| """This example shows how to add a multi-task objective that is trained
 | |
| alongside the entity recognizer. This is an alternative to adding features
 | |
| to the model.
 | |
| 
 | |
| The multi-task idea is to train an auxiliary model to predict some attribute,
 | |
| with weights shared between the auxiliary model and the main model. In this
 | |
| example, we're predicting the position of the word in the document.
 | |
| 
 | |
| The model that predicts the position of the word encourages the convolutional
 | |
| layers to include the position information in their representation. The
 | |
| information is then available to the main model, as a feature.
 | |
| 
 | |
| The overall idea is that we might know something about what sort of features
 | |
| we'd like the CNN to extract. The multi-task objectives can encourage the
 | |
| extraction of this type of feature. The multi-task objective is only used
 | |
| during training. We discard the auxiliary model before run-time.
 | |
| 
 | |
| The specific example here is not necessarily a good idea --- but it shows
 | |
| how an arbitrary objective function for some word can be used.
 | |
| 
 | |
| Developed and tested for spaCy 2.0.6. Updated for v2.2.2
 | |
| """
 | |
| import random
 | |
| import plac
 | |
| import spacy
 | |
| import os.path
 | |
| from spacy.tokens import Doc
 | |
| from spacy.gold import read_json_file, GoldParse
 | |
| 
 | |
| random.seed(0)
 | |
| 
 | |
| PWD = os.path.dirname(__file__)
 | |
| 
 | |
| TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
 | |
| 
 | |
| 
 | |
| def get_position_label(i, token_annotation):
 | |
|     """Return labels indicating the position of the word in the document.
 | |
|     """
 | |
|     if len(token_annotation.words) < 20:
 | |
|         return "short-doc"
 | |
|     elif i == 0:
 | |
|         return "first-word"
 | |
|     elif i < 10:
 | |
|         return "early-word"
 | |
|     elif i < 20:
 | |
|         return "mid-word"
 | |
|     elif i == len(token_annotation.words) - 1:
 | |
|         return "last-word"
 | |
|     else:
 | |
|         return "late-word"
 | |
| 
 | |
| 
 | |
| def main(n_iter=10):
 | |
|     nlp = spacy.blank("en")
 | |
|     ner = nlp.create_pipe("ner")
 | |
|     ner.add_multitask_objective(get_position_label)
 | |
|     nlp.add_pipe(ner)
 | |
|     print(nlp.pipeline)
 | |
| 
 | |
|     print("Create data", len(TRAIN_DATA))
 | |
|     optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
 | |
|     for itn in range(n_iter):
 | |
|         random.shuffle(TRAIN_DATA)
 | |
|         losses = {}
 | |
|         for example in TRAIN_DATA:
 | |
|             for token_annotation in example.token_annotations:
 | |
|                 doc = Doc(nlp.vocab, words=token_annotation.words)
 | |
|                 gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
 | |
| 
 | |
|                 nlp.update(
 | |
|                     examples=[(doc, gold)],  # 1 example
 | |
|                     drop=0.2,  # dropout - make it harder to memorise data
 | |
|                     sgd=optimizer,  # callable to update weights
 | |
|                     losses=losses,
 | |
|                 )
 | |
|         print(losses.get("nn_labeller", 0.0), losses["ner"])
 | |
| 
 | |
|     # test the trained model
 | |
|     for example in TRAIN_DATA:
 | |
|         if example.text is not None:
 | |
|             doc = nlp(example.text)
 | |
|             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
 | |
|             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     plac.call(main)
 |
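The docstring notes that any objective function over words can be used. As a further illustration, here is a hedged sketch of an alternative auxiliary objective with the same (i, token_annotation) signature as get_position_label; it relies only on token_annotation.words, which the script already uses, and the bucket labels are made up for the example.

def get_length_label(i, token_annotation):
    """Bucket each word by its character length (illustrative labels)."""
    word = token_annotation.words[i]
    if len(word) <= 3:
        return "short"
    elif len(word) <= 7:
        return "medium"
    return "long"

# It would be attached the same way as get_position_label in main():
#     ner.add_multitask_objective(get_length_label)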