mirror of https://github.com/explosion/spaCy.git

code cleanup

parent d8b435ceff
commit 61f0e2af65
@@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = title_list[0:34200]
     title_to_id = {t: title_to_id[t] for t in title_list}
 
-    # print("title_list", len(title_list), title_list[0:3])
-
     entity_list = [title_to_id[x] for x in title_list]
-    # print("entity_list", len(entity_list), entity_list[0:3])
 
-    # TODO: should we remove entities from the KB where there is no description ?
+    # Currently keeping entities from the KB where there is no description - putting a default void description
     description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
-    # print("description_list", len(description_list), description_list[0:3])
-
 
     print()
     print("2. _get_entity_frequencies", datetime.datetime.now())
@@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     print("4. get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
-    # print("descriptions", description_list[0:3])
-    # print("embeddings", len(embeddings), embeddings[0:3])
-    #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
 
     print()
     print("5. adding", len(entity_list), "entities", datetime.datetime.now())
@@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_
         for qid, descr in id_to_descr.items():
             descr_file.write(str(qid) + "|" + descr + "\n")
 
+
 def _get_entity_to_id(entity_def_output):
     entity_to_id = dict()
     with open(entity_def_output, 'r', encoding='utf8') as csvfile:
@@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
         print("wp titles:", wp_titles)
 
     # adding aliases with prior probabilities
-        # we can read this file sequentially, it's sorted by alias, and then by count
+    # we can read this file sequentially, it's sorted by alias, and then by count
     with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
         # skip header
         prior_file.readline()

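The comment in the hunk above explains why a single sequential pass over the prior-probability file is enough: the file is sorted by alias and then by count, so all rows for one alias are adjacent and can be flushed to the KB as soon as the alias changes. A minimal sketch of that grouping pattern follows; the pipe-separated alias|count|entity layout, the header row, and the add_alias callback are illustrative assumptions, not the script's actual file format or API.

# Sketch only: stream a file sorted by (alias, count) and group rows per alias.
# The "alias|count|entity" pipe-separated layout and the add_alias callback are
# assumptions for illustration, not the actual format used by the script.
def read_prior_probs(prior_prob_path, add_alias, max_entities_per_alias, min_occ):
    with open(prior_prob_path, mode="r", encoding="utf8") as prior_file:
        prior_file.readline()  # skip header
        current_alias = None
        counts, entities = [], []
        for line in prior_file:
            alias, count, entity = line.rstrip("\n").split("|")
            count = int(count)
            if current_alias is not None and alias != current_alias:
                # alias changed: every row for the previous alias has been seen
                add_alias(current_alias, entities, counts)
                counts, entities = [], []
            current_alias = alias
            if count >= min_occ and len(entities) < max_entities_per_alias:
                counts.append(count)
                entities.append(entity)
        if current_alias is not None:
            add_alias(current_alias, entities, counts)
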
@@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator
 
 
 def run_kb_toy_example(kb):
-    for mention in ("Bush", "President", "Homer"):
+    for mention in ("Bush", "Douglas Adams", "Homer"):
         candidates = kb.get_candidates(mention)
 
         print("generating candidates for " + mention + " :")
@@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True):
     return precision, recall, fscore, accuracy
 
 
-def _prepare_pipeline(nlp, kb):
-    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
 
 
 # TODO
-def add_coref():
+def add_coref(nlp):
     """ Add coreference resolution to our model """
-    nlp = spacy.load('en_core_web_sm')
-    # nlp = spacy.load('en')
-
     # TODO: this doesn't work yet
     # neuralcoref.add_to_pipe(nlp)
     print("done adding to pipe")

@@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv"
 def create_training(kb, entity_def_input, training_output):
     if not kb:
         raise ValueError("kb should be defined")
-    # nlp = spacy.load('en_core_web_sm')
     wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
     _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000)  # TODO: full dataset
 

@@ -37,11 +37,13 @@ if __name__ == "__main__":
 
     # read KB back in from file
     to_read_kb = True
-    to_test_kb = True
+    to_test_kb = False
 
     # create training dataset
     create_wp_training = False
 
+    train_pipe = True
+
     # run EL training
     run_el_training = False
 
@@ -106,7 +108,15 @@ if __name__ == "__main__":
         print("STEP 5: create training dataset", datetime.datetime.now())
         training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
 
-    # STEP 6: apply the EL algorithm on the training dataset
+    # STEP 6: create the entity linking pipe
+    if train_pipe:
+        # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        nlp.add_pipe(el_pipe, last=True)
+
+    ### BELOW CODE IS DEPRECATED ###
+
+    # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
     if run_el_training:
         print("STEP 6: training", datetime.datetime.now())
         trainer = EL_Model(kb=my_kb, nlp=nlp)

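The new train_pipe branch above only constructs and attaches the pipe; the encoders themselves are built lazily in EntityLinker.begin_training, shown in the pipes.pyx hunk further down. A rough sketch of how a training step could then be driven, based only on the begin_training and update signatures visible in this commit; training_batches and its unpacking are hypothetical placeholders, not the script's actual training loop.

# Hedged sketch: exercises only the begin_training/update signatures from the
# pipes.pyx hunk below; the batching iterable is a made-up placeholder.
if train_pipe:
    el_pipe.begin_training()  # builds the article/sentence/mention encoders and their optimizers
    for entity_docs, article_docs, sentence_docs, golds in training_batches:  # hypothetical iterable
        el_pipe.update(docs=(entity_docs, article_docs, sentence_docs), golds=golds, drop=0.1)
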
@@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
 
 
 class EntityLinker(Pipe):
+    """Pipeline component for named entity linking.
+
+    DOCS: TODO
+    """
     name = 'entity_linker'
 
     @classmethod
     def Model(cls, **cfg):
         embed_width = cfg.get("embed_width", 300)
         hidden_width = cfg.get("hidden_width", 32)
-        entity_width = cfg.get("entity_width", 64)
         article_width = cfg.get("article_width", 128)
         sent_width = cfg.get("sent_width", 64)
-
-        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+        entity_width = cfg["kb"].entity_vector_length
 
         article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
         sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
 
         # dimension of the mention encoder needs to match the dimension of the entity encoder
-        mention_width = entity_encoder.nO
+        mention_width = article_width + sent_width
         mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
 
-        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+        return article_encoder, sent_encoder, mention_encoder
 
     def __init__(self, **cfg):
-        # TODO: bring-your-own-model
         self.mention_encoder = True
-
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
 
-        # TODO: fix this. store entity vectors in the KB ?
-        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
-
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
         with self.article_encoder.use_params(self.sgd_article.averages) \
-                 and self.entity_encoder.use_params(self.sgd_entity.averages)\
                  and self.sent_encoder.use_params(self.sgd_sent.averages) \
                  and self.mention_encoder.use_params(self.sgd_mention.averages):
             yield

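The dimension bookkeeping in Model() is the heart of the hunk above: the separate entity encoder is dropped, entity vectors now come from the KB itself (entity_width = cfg["kb"].entity_vector_length), and the mention representation is the concatenation of the article and sentence encodings, which is why mention_width = article_width + sent_width. The Affine layer then projects that concatenation into the entity-vector space so mention and entity can be compared directly. Below is a small numpy sketch of the shape flow using the default widths from the hunk; the random arrays stand in for real encoder output, and 64 is only a placeholder for kb.entity_vector_length.

import numpy as np

article_width, sent_width = 128, 64      # defaults from Model()
entity_width = 64                        # placeholder for cfg["kb"].entity_vector_length

doc_encoding = np.random.rand(article_width)   # stand-in for article_encoder output
sent_encoding = np.random.rand(sent_width)     # stand-in for sent_encoder output

# the mention representation concatenates both encodings
mention = np.concatenate([doc_encoding, sent_encoding])   # shape (192,) == article_width + sent_width

# Affine(entity_width, mention_width) amounts to a projection into entity-vector space
W = np.random.rand(entity_width, mention.shape[0])
b = np.zeros(entity_width)
mention_in_entity_space = W @ mention + b                  # shape (entity_width,), comparable to KB vectors
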
@@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
         if self.mention_encoder is True:
-            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
             self.sgd_article = create_default_optimizer(self.article_encoder.ops)
             self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
             self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
-            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
 
     def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
-        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
         self.require_model()
 
         entity_docs, article_docs, sentence_docs = docs
@@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
             article_docs = [article_docs]
             sentence_docs = [sentence_docs]
 
-        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        entity_encodings = None #TODO
         doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
         sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
 
@@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
                         for c in candidates:
                             prior_prob = c.prior_prob
                             kb_id = c.entity_
-                            description = self.id_to_descr.get(kb_id)
-                            entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
-                            sim = cosine(entity_encodings, mention_enc_t)
-                            score = prior_prob + sim - (prior_prob*sim)  # TODO: weights ?
+                            entity_encoding = c.entity_vector
+                            sim = cosine([entity_encoding], mention_enc_t)
+                            score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                             scores.append(score)
 
                         best_index = scores.index(max(scores))

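The rewritten candidate loop above scores each candidate as score = prior_prob + sim - prior_prob*sim, where sim is the cosine similarity between the candidate's stored KB vector (c.entity_vector) and the projected mention encoding. For values in [0, 1] this is a probabilistic-OR style combination, 1 - (1 - prior)(1 - sim): the score equals the prior when sim is 0 and climbs toward 1 as either factor grows; cosine can in principle be negative, which the formula does not guard against, and the trailing comment notes that explicit weights on the two factors may still be wanted. A self-contained illustration with made-up vectors and a cosine helper written out in numpy (not the module's own cosine):

import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

entity_vector = np.array([0.2, 0.9, 0.1])    # made-up KB vector for one candidate
mention_vector = np.array([0.3, 0.8, 0.0])   # made-up projected mention encoding
prior_prob = 0.7                             # made-up alias prior

sim = cosine(entity_vector, mention_vector)
score = prior_prob + sim - (prior_prob * sim)   # e.g. prior 0.7, sim 0.5 -> 0.7 + 0.5 - 0.35 = 0.85
print(score)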