Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-25 05:01:02 +03:00)
			
		
		
		
	experiment with adding NER types to the feature vector
This commit is contained in:
		
							parent
							
								
									c664f58246
								
							
						
					
					
						commit
						2d2dea9924
					
				|  | @ -42,7 +42,7 @@ MIN_PAIR_OCC = 5 | ||||||
| 
 | 
 | ||||||
| # model training parameters | # model training parameters | ||||||
| EPOCHS = 10 | EPOCHS = 10 | ||||||
| DROPOUT = 0.2 | DROPOUT = 0.5 | ||||||
| LEARN_RATE = 0.005 | LEARN_RATE = 0.005 | ||||||
| L2 = 1e-6 | L2 = 1e-6 | ||||||
| CONTEXT_WIDTH = 128 | CONTEXT_WIDTH = 128 | ||||||
|  | @ -73,10 +73,10 @@ def run_pipeline(): | ||||||
|     measure_performance = True |     measure_performance = True | ||||||
| 
 | 
 | ||||||
|     # test the EL pipe on a simple example |     # test the EL pipe on a simple example | ||||||
|     to_test_pipeline = True |     to_test_pipeline = False | ||||||
| 
 | 
 | ||||||
|     # write the NLP object, read back in and test again |     # write the NLP object, read back in and test again | ||||||
|     to_write_nlp = True |     to_write_nlp = False | ||||||
|     to_read_nlp = False |     to_read_nlp = False | ||||||
|     test_from_file = False |     test_from_file = False | ||||||
| 
 | 
 | ||||||
|  | @ -138,9 +138,12 @@ def run_pipeline(): | ||||||
|     # STEP 6: create and train the entity linking pipe |     # STEP 6: create and train the entity linking pipe | ||||||
|     if train_pipe: |     if train_pipe: | ||||||
|         print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) |         print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) | ||||||
|  |         type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} | ||||||
|  |         print(" -analysing", len(type_to_int), "different entity types") | ||||||
|         el_pipe = nlp_2.create_pipe(name='entity_linker', |         el_pipe = nlp_2.create_pipe(name='entity_linker', | ||||||
|                                     config={"context_width": CONTEXT_WIDTH, |                                     config={"context_width": CONTEXT_WIDTH, | ||||||
|                                             "pretrained_vectors": nlp_2.vocab.vectors.name}) |                                             "pretrained_vectors": nlp_2.vocab.vectors.name, | ||||||
|  |                                             "type_to_int": type_to_int}) | ||||||
|         el_pipe.set_kb(kb_2) |         el_pipe.set_kb(kb_2) | ||||||
|         nlp_2.add_pipe(el_pipe, last=True) |         nlp_2.add_pipe(el_pipe, last=True) | ||||||
| 
 | 
 | ||||||
|  | @ -151,8 +154,8 @@ def run_pipeline(): | ||||||
|             optimizer.L2 = L2 |             optimizer.L2 = L2 | ||||||
| 
 | 
 | ||||||
|         # define the size (nr of entities) of training and dev set |         # define the size (nr of entities) of training and dev set | ||||||
|         train_limit = 500000 |         train_limit = 50000 | ||||||
|         dev_limit = 5000 |         dev_limit = 50000 | ||||||
| 
 | 
 | ||||||
|         train_data = training_set_creator.read_training(nlp=nlp_2, |         train_data = training_set_creator.read_training(nlp=nlp_2, | ||||||
|                                                         training_dir=TRAINING_DIR, |                                                         training_dir=TRAINING_DIR, | ||||||
|  | @ -219,7 +222,7 @@ def run_pipeline(): | ||||||
|                 # measuring combined accuracy (prior + context) |                 # measuring combined accuracy (prior + context) | ||||||
|                 el_pipe.context_weight = 1 |                 el_pipe.context_weight = 1 | ||||||
|                 el_pipe.prior_weight = 1 |                 el_pipe.prior_weight = 1 | ||||||
|                 dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) |                 dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) | ||||||
|                 print("dev acc combo avg:", round(dev_acc_combo, 3), |                 print("dev acc combo avg:", round(dev_acc_combo, 3), | ||||||
|                       [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) |                       [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) | ||||||
| 
 | 
 | ||||||
|  | @ -264,7 +267,7 @@ def run_pipeline(): | ||||||
|         nlp_3 = spacy.load(NLP_2_DIR) |         nlp_3 = spacy.load(NLP_2_DIR) | ||||||
|         el_pipe = nlp_3.get_pipe("entity_linker") |         el_pipe = nlp_3.get_pipe("entity_linker") | ||||||
| 
 | 
 | ||||||
|         dev_limit = 10000 |         dev_limit = 5000 | ||||||
|         dev_data = training_set_creator.read_training(nlp=nlp_2, |         dev_data = training_set_creator.read_training(nlp=nlp_2, | ||||||
|                                                       training_dir=TRAINING_DIR, |                                                       training_dir=TRAINING_DIR, | ||||||
|                                                       dev=True, |                                                       dev=True, | ||||||
|  | @ -273,7 +276,7 @@ def run_pipeline(): | ||||||
|         print("Dev testing from file on", len(dev_data), "articles") |         print("Dev testing from file on", len(dev_data), "articles") | ||||||
|         print() |         print() | ||||||
| 
 | 
 | ||||||
|         dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) |         dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False) | ||||||
|         print("dev acc combo avg:", round(dev_acc_combo, 3), |         print("dev acc combo avg:", round(dev_acc_combo, 3), | ||||||
|               [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) |               [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) | ||||||
| 
 | 
 | ||||||
|  | @ -281,7 +284,7 @@ def run_pipeline(): | ||||||
|     print("STOP", datetime.datetime.now()) |     print("STOP", datetime.datetime.now()) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _measure_accuracy(data, el_pipe=None): | def _measure_accuracy(data, el_pipe=None, error_analysis=False): | ||||||
|     # If the docs in the data require further processing with an entity linker, set el_pipe |     # If the docs in the data require further processing with an entity linker, set el_pipe | ||||||
|     correct_by_label = dict() |     correct_by_label = dict() | ||||||
|     incorrect_by_label = dict() |     incorrect_by_label = dict() | ||||||
|  | @ -312,6 +315,10 @@ def _measure_accuracy(data, el_pipe=None): | ||||||
|                     else: |                     else: | ||||||
|                         incorrect = incorrect_by_label.get(ent_label, 0) |                         incorrect = incorrect_by_label.get(ent_label, 0) | ||||||
|                         incorrect_by_label[ent_label] = incorrect + 1 |                         incorrect_by_label[ent_label] = incorrect + 1 | ||||||
|  |                         if error_analysis: | ||||||
|  |                             print(ent.text, "in", doc) | ||||||
|  |                             print("Predicted",  pred_entity, "should have been", gold_entity) | ||||||
|  |                             print() | ||||||
| 
 | 
 | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             print("Error assessing accuracy", e) |             print("Error assessing accuracy", e) | ||||||
|  |  | ||||||
|  | @ -652,7 +652,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def build_nel_encoder(embed_width, hidden_width, **cfg): | def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): | ||||||
|     # TODO proper error |     # TODO proper error | ||||||
|     if "entity_width" not in cfg: |     if "entity_width" not in cfg: | ||||||
|         raise ValueError("entity_width not found") |         raise ValueError("entity_width not found") | ||||||
|  | @ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): | ||||||
|     entity_width = cfg.get("entity_width") |     entity_width = cfg.get("entity_width") | ||||||
| 
 | 
 | ||||||
|     with Model.define_operators({">>": chain, "**": clone}): |     with Model.define_operators({">>": chain, "**": clone}): | ||||||
|         model = Affine(entity_width, entity_width+context_width+1)\ |         model = Affine(entity_width, entity_width+context_width+1+ner_types)\ | ||||||
|                 >> Affine(1, entity_width, drop_factor=0.0)\ |                 >> Affine(1, entity_width, drop_factor=0.0)\ | ||||||
|                 >> logistic |                 >> logistic | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1074,8 +1074,9 @@ class EntityLinker(Pipe): | ||||||
|     def Model(cls, **cfg): |     def Model(cls, **cfg): | ||||||
|         embed_width = cfg.get("embed_width", 300) |         embed_width = cfg.get("embed_width", 300) | ||||||
|         hidden_width = cfg.get("hidden_width", 128) |         hidden_width = cfg.get("hidden_width", 128) | ||||||
|  |         type_to_int = cfg.get("type_to_int", dict()) | ||||||
| 
 | 
 | ||||||
|         model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) |         model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) | ||||||
|         return model |         return model | ||||||
| 
 | 
 | ||||||
|     def __init__(self, **cfg): |     def __init__(self, **cfg): | ||||||
|  | @ -1086,6 +1087,7 @@ class EntityLinker(Pipe): | ||||||
|         self.context_weight = cfg.get("context_weight", 1) |         self.context_weight = cfg.get("context_weight", 1) | ||||||
|         self.prior_weight = cfg.get("prior_weight", 1) |         self.prior_weight = cfg.get("prior_weight", 1) | ||||||
|         self.context_width = cfg.get("context_width") |         self.context_width = cfg.get("context_width") | ||||||
|  |         self.type_to_int = cfg.get("type_to_int", dict()) | ||||||
| 
 | 
 | ||||||
|     def set_kb(self, kb): |     def set_kb(self, kb): | ||||||
|         self.kb = kb |         self.kb = kb | ||||||
|  | @ -1134,11 +1136,22 @@ class EntityLinker(Pipe): | ||||||
|         entity_encodings = [] |         entity_encodings = [] | ||||||
|         cats = [] |         cats = [] | ||||||
|         priors = [] |         priors = [] | ||||||
|  |         type_vectors = [] | ||||||
| 
 | 
 | ||||||
|         for doc, gold in zip(docs, golds): |         for doc, gold in zip(docs, golds): | ||||||
|  |             ents_by_offset = dict() | ||||||
|  |             for ent in doc.ents: | ||||||
|  |                 ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent | ||||||
|             for entity in gold.links: |             for entity in gold.links: | ||||||
|                 start, end, gold_kb = entity |                 start, end, gold_kb = entity | ||||||
|                 mention = doc.text[start:end] |                 mention = doc.text[start:end] | ||||||
|  | 
 | ||||||
|  |                 gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] | ||||||
|  |                 assert gold_ent is not None | ||||||
|  |                 type_vector = [0 for i in range(len(self.type_to_int))] | ||||||
|  |                 if len(self.type_to_int) > 0: | ||||||
|  |                     type_vector[self.type_to_int[gold_ent.label_]] = 1 | ||||||
|  | 
 | ||||||
|                 candidates = self.kb.get_candidates(mention) |                 candidates = self.kb.get_candidates(mention) | ||||||
|                 random.shuffle(candidates) |                 random.shuffle(candidates) | ||||||
|                 nr_neg = 0 |                 nr_neg = 0 | ||||||
|  | @ -1147,6 +1160,7 @@ class EntityLinker(Pipe): | ||||||
|                     entity_encoding = c.entity_vector |                     entity_encoding = c.entity_vector | ||||||
|                     entity_encodings.append(entity_encoding) |                     entity_encodings.append(entity_encoding) | ||||||
|                     context_docs.append(doc) |                     context_docs.append(doc) | ||||||
|  |                     type_vectors.append(type_vector) | ||||||
| 
 | 
 | ||||||
|                     if self.prior_weight > 0: |                     if self.prior_weight > 0: | ||||||
|                         priors.append([c.prior_prob]) |                         priors.append([c.prior_prob]) | ||||||
|  | @ -1160,12 +1174,12 @@ class EntityLinker(Pipe): | ||||||
|                         cats.append([0]) |                         cats.append([0]) | ||||||
| 
 | 
 | ||||||
|         if len(entity_encodings) > 0: |         if len(entity_encodings) > 0: | ||||||
|             assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) |             assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) | ||||||
| 
 | 
 | ||||||
|             context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) |             context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) | ||||||
|             entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") |             entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") | ||||||
| 
 | 
 | ||||||
|             mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] |             mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] | ||||||
|                                  for i in range(len(entity_encodings))] |                                  for i in range(len(entity_encodings))] | ||||||
|             pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) |             pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) | ||||||
|             cats = self.model.ops.asarray(cats, dtype="float32") |             cats = self.model.ops.asarray(cats, dtype="float32") | ||||||
|  | @ -1225,6 +1239,10 @@ class EntityLinker(Pipe): | ||||||
|             if len(doc) > 0: |             if len(doc) > 0: | ||||||
|                 context_encoding = context_encodings[i] |                 context_encoding = context_encodings[i] | ||||||
|                 for ent in doc.ents: |                 for ent in doc.ents: | ||||||
|  |                     type_vector = [0 for i in range(len(self.type_to_int))] | ||||||
|  |                     if len(self.type_to_int) > 0: | ||||||
|  |                         type_vector[self.type_to_int[ent.label_]] = 1 | ||||||
|  | 
 | ||||||
|                     candidates = self.kb.get_candidates(ent.text) |                     candidates = self.kb.get_candidates(ent.text) | ||||||
|                     if candidates: |                     if candidates: | ||||||
|                         random.shuffle(candidates) |                         random.shuffle(candidates) | ||||||
|  | @ -1238,7 +1256,7 @@ class EntityLinker(Pipe): | ||||||
|                             entity_encodings = xp.asarray([c.entity_vector for c in candidates]) |                             entity_encodings = xp.asarray([c.entity_vector for c in candidates]) | ||||||
|                             assert len(entity_encodings) == len(prior_probs) |                             assert len(entity_encodings) == len(prior_probs) | ||||||
|                             mention_encodings = [list(context_encoding) + list(entity_encodings[i]) |                             mention_encodings = [list(context_encoding) + list(entity_encodings[i]) | ||||||
|                                                  + list(prior_probs[i]) |                                                  + list(prior_probs[i]) + type_vector | ||||||
|                                                  for i in range(len(entity_encodings))] |                                                  for i in range(len(entity_encodings))] | ||||||
|                             scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) |                             scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user