mirror of https://github.com/explosion/spaCy.git
	separate entity encoder to get 64D descriptions

This commit is contained in:
parent fb37cdb2d3
commit 9abbd0899f

examples/pipeline/wiki_entity_linking/train_descriptions.py (new file, 113 lines)

examples/pipeline/wiki_entity_linking/train_descriptions.py (new file)
@@ -0,0 +1,113 @@
+from random import shuffle
+
+from examples.pipeline.wiki_entity_linking import kb_creator
+
+import numpy as np
+
+from spacy._ml import zero_init, create_default_optimizer
+from spacy.cli.pretrain import get_cossim_loss
+
+from thinc.v2v import Model
+from thinc.api import chain
+from thinc.neural._classes.affine import Affine
+
+
+class EntityEncoder:
+
+    INPUT_DIM = 300  # dimension of pre-trained vectors
+    DESC_WIDTH = 64
+
+    DROP = 0
+    EPOCHS = 5
+    STOP_THRESHOLD = 0.05
+
+    BATCH_SIZE = 1000
+
+    def __init__(self, kb, nlp):
+        self.nlp = nlp
+        self.kb = kb
+
+    def run(self, entity_descr_output):
+        id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
+
+        processed, loss = self._train_model(entity_descr_output, id_to_descr)
+        print("Trained on", processed, "entities across", self.EPOCHS, "epochs")
+        print("Final loss:", loss)
+        print()
+
+        # TODO: apply and write to file afterwards !
+        # self._apply_encoder(id_to_descr)
+
+    def _train_model(self, entity_descr_output, id_to_descr):
+        # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy
+
+        self._build_network(self.INPUT_DIM, self.DESC_WIDTH)
+
+        processed = 0
+        loss = 1
+
+        for i in range(self.EPOCHS):
+            entity_keys = list(id_to_descr.keys())
+            shuffle(entity_keys)
+
+            batch_nr = 0
+            start = 0
+            stop = min(self.BATCH_SIZE, len(entity_keys))
+
+            while loss > self.STOP_THRESHOLD and start < len(entity_keys):
+                batch = []
+                for e in entity_keys[start:stop]:
+                    descr = id_to_descr[e]
+                    doc = self.nlp(descr)
+                    doc_vector = self._get_doc_embedding(doc)
+                    batch.append(doc_vector)
+
+                loss = self.update(batch)
+                print(i, batch_nr, loss)
+                processed += len(batch)
+
+                batch_nr += 1
+                start = start + self.BATCH_SIZE
+                stop = min(stop + self.BATCH_SIZE, len(entity_keys))
+
+        return processed, loss
+
+    def _apply_encoder(self, id_to_descr):
+        for id, descr in id_to_descr.items():
+            doc = self.nlp(descr)
+            doc_vector = self._get_doc_embedding(doc)
+            encoding = self.encoder(np.asarray([doc_vector]))
+
+    @staticmethod
+    def _get_doc_embedding(doc):
+        indices = np.zeros((len(doc),), dtype="i")
+        for i, word in enumerate(doc):
+            if word.orth in doc.vocab.vectors.key2row:
+                indices[i] = doc.vocab.vectors.key2row[word.orth]
+            else:
+                indices[i] = 0
+        word_vectors = doc.vocab.vectors.data[indices]
+        doc_vector = np.mean(word_vectors, axis=0)  # TODO: min? max?
+        return doc_vector
+
+    def _build_network(self, orig_width, hidden_width):
+        with Model.define_operators({">>": chain}):
+            self.encoder = (
+                Affine(hidden_width, orig_width)
+            )
+            self.model = self.encoder >> zero_init(Affine(orig_width, hidden_width, drop_factor=0.0))
+
+        self.sgd = create_default_optimizer(self.model.ops)
+
+    def update(self, vectors):
+        predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP)
+
+        loss, d_scores = self.get_loss(scores=predictions, golds=np.asarray(vectors))
+        bp_model(d_scores, sgd=self.sgd)
+
+        return loss / len(vectors)
+
+    @staticmethod
+    def get_loss(golds, scores):
+        loss, gradients = get_cossim_loss(scores, golds)
+        return loss, gradients

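The new EntityEncoder above is a small autoencoder: each description is mean-pooled into a 300-D vector, compressed to 64-D by the encoder Affine, reconstructed back to 300-D by a zero-initialized decoder Affine, and trained with the cosine-similarity loss imported from spacy.cli.pretrain. A minimal NumPy-only sketch of that objective, not part of the commit (encode, decode and cosine_loss are stand-in names, not spaCy or thinc APIs):

# Rough sketch of the 300 -> 64 -> 300 reconstruction objective used by
# EntityEncoder; all names here are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
INPUT_DIM, DESC_WIDTH = 300, 64

W_enc = rng.normal(scale=0.01, size=(DESC_WIDTH, INPUT_DIM))  # encoder: 300 -> 64
W_dec = rng.normal(scale=0.01, size=(INPUT_DIM, DESC_WIDTH))  # decoder: 64 -> 300 (zero-initialized in the commit)

def encode(x):
    return W_enc @ x

def decode(z):
    return W_dec @ z

def cosine_loss(pred, gold):
    # 1 - cosine similarity, standing in for get_cossim_loss
    num = float(pred @ gold)
    denom = float(np.linalg.norm(pred) * np.linalg.norm(gold)) + 1e-8
    return 1.0 - num / denom

doc_vector = rng.normal(size=INPUT_DIM)   # mean-pooled word vectors of a description
desc_encoding = encode(doc_vector)        # the 64-D description embedding kept after training
reconstruction = decode(desc_encoding)    # mapped back to 300-D for the loss
print("reconstruction loss:", cosine_loss(reconstruction, doc_vector))
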
@@ -31,7 +31,7 @@ class EL_Model:
     PRINT_BATCH_LOSS = False
     EPS = 0.0000000005
 
-    BATCH_SIZE = 5
+    BATCH_SIZE = 100
 
     DOC_CUTOFF = 300    # number of characters from the doc context
     INPUT_DIM = 300     # dimension of pre-trained vectors
@@ -41,9 +41,9 @@ class EL_Model:
     ARTICLE_WIDTH = 128
     SENT_WIDTH = 64
 
-    DROP = 0.1
-    LEARN_RATE = 0.001
-    EPOCHS = 5
+    DROP = 0.4
+    LEARN_RATE = 0.005
+    EPOCHS = 10
     L2 = 1e-6
 
     name = "entity_linker"
@@ -62,12 +62,14 @@ class EL_Model:
     def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True):
         np.seterr(divide="raise", over="warn", under="ignore", invalid="raise")
 
+        id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
+
         train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \
-            self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False)
+            self._get_training_data(training_dir, id_to_descr, False, trainlimit, to_print=False)
         train_clusters = list(train_ent.keys())
 
         dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \
-            self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False)
+            self._get_training_data(training_dir, id_to_descr, True, devlimit, to_print=False)
         dev_clusters = list(dev_ent.keys())
 
         dev_pos_count = len([g for g in dev_gold.values() if g])
@@ -386,9 +388,7 @@ class EL_Model:
         bp_doc(doc_gradients, sgd=self.sgd_article)
         bp_sent(sent_gradients, sgd=self.sgd_sent)
 
-    def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print):
-        id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
-
+    def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print):
         correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir,
                                                                                          collect_correct=True,
                                                                                          collect_incorrect=True)

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
+from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder
 from examples.pipeline.wiki_entity_linking.train_el import EL_Model
 
 import spacy
@@ -38,11 +39,14 @@ if __name__ == "__main__":
     to_read_kb = True
     to_test_kb = False
 
+    # run entity description pre-training
+    run_desc_training = True
+
     # create training dataset
     create_wp_training = False
 
-    # run training
-    run_training = True
+    # run EL training
+    run_el_training = False
 
     # apply named entity linking to the dev dataset
     apply_to_dev = False
@@ -101,17 +105,25 @@ if __name__ == "__main__":
             run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
             print()
 
+    # STEP 4b : read KB back in from file, create entity descriptions
+    # TODO: write back to file
+    if run_desc_training:
+        print("STEP 4b: training entity descriptions", datetime.datetime.now())
+        my_nlp = spacy.load('en_core_web_md')
+        EntityEncoder(my_kb, my_nlp).run(entity_descr_output=ENTITY_DESCR)
+        print()
+
     # STEP 5: create a training dataset from WP
     if create_wp_training:
         print("STEP 5: create training dataset", datetime.datetime.now())
         training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
 
     # STEP 6: apply the EL algorithm on the training dataset
-    if run_training:
+    if run_el_training:
         print("STEP 6: training", datetime.datetime.now())
         my_nlp = spacy.load('en_core_web_md')
         trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-        trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20)
+        trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
         print()
 
     # STEP 7: apply the EL algorithm on the dev dataset

@@ -1177,6 +1177,8 @@ class EntityLinker(Pipe):
 
     def predict(self, docs):
         self.require_model()
+        final_entities = list()
+        final_kb_ids = list()
         for i, article_doc in enumerate(docs):
             doc_encoding = self.article_encoder([article_doc])
             for ent in article_doc.ents:
@@ -1188,23 +1190,27 @@ class EntityLinker(Pipe):
 
                 candidates = self.kb.get_candidates(ent.text)
                 if candidates:
-                    highest_sim = -5
-                    best_i = -1
                     with self.use_avg_params:
+                        scores = list()
                         for c in candidates:
+                            prior_prob = c.prior_prob
                             kb_id = c.entity_
                             description = self.id_to_descr.get(kb_id)
                             entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
                             sim = cosine(entity_encodings, mention_enc_t)
-                            if sim >= highest_sim:
-                                best_i = i
-                                highest_sim = sim
+                            score = prior_prob + sim - (prior_prob*sim)  # TODO: weights ?
+                            scores.append(score)
 
-                    # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob)
+                        best_index = scores.index(max(scores))
+                        best_candidate = candidates[best_index]
+                        final_entities.append(ent)
+                        final_kb_ids.append(best_candidate)
+
+        return final_entities, final_kb_ids
 
     def set_annotations(self, docs, entities, kb_ids=None):
-        for token, kb_id in zip(entities, kb_ids):
-            token.ent_kb_id_ = kb_id
+        for entity, kb_id in zip(entities, kb_ids):
+            entity.ent_kb_id_ = kb_id
 
 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
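In the updated predict() above, each candidate is scored as prior_prob + sim - prior_prob*sim, combining the KB prior probability with the cosine similarity of the encodings the way independent probabilities combine under OR, so either a strong prior or a strong similarity can push a candidate to the top; since cosine similarity can be negative, that probabilistic reading is only a rough heuristic (hence the "TODO: weights ?"). A tiny standalone check with a hypothetical helper, not part of the commit:

def combine(prior_prob, sim):
    # same formula as in predict(): inclusion-exclusion of the two signals
    return prior_prob + sim - prior_prob * sim

print(combine(0.9, 0.1))  # 0.91 -> driven by the prior
print(combine(0.1, 0.9))  # 0.91 -> driven by the similarity
print(combine(0.5, 0.5))  # 0.75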