mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Fix get_matching_ents (#10451)
* Fix get_matching_ents

  Not sure what happened here - the code prior to this commit simply does not work. It's already covered by entity linker tests, which were succeeding in the NEL PR, but couldn't possibly succeed on master.

* Fix test

  The test was indented inside another test and so doesn't seem to have been running properly.
This commit is contained in:
parent 7ed7908716
commit 61ba5450ff
@@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)

-    @pytest.mark.parametrize("patterns", [
-        # perfect case
-        [{"label": "CHARACTER", "pattern": "Kirby"}],
-        # typo for false negative
-        [{"label": "PERSON", "pattern": "Korby"}],
-        # random stuff for false positive
-        [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
-        ]
-    )
-    def test_no_gold_ents(patterns):
-        # test that annotating components work
-        TRAIN_DATA = [
-            (
-                "Kirby is pink",
-                {
-                    "links": {(0, 5): {"Q613241": 1.0}},
-                    "entities": [(0, 5, "CHARACTER")],
-                    "sent_starts": [1, 0, 0],
-                },
-            )
-        ]
-        nlp = English()
-        vector_length = 3
-        train_examples = []
-        for text, annotation in TRAIN_DATA:
-            doc = nlp(text)
-            train_examples.append(Example.from_dict(doc, annotation))
+@pytest.mark.parametrize("patterns", [
+    # perfect case
+    [{"label": "CHARACTER", "pattern": "Kirby"}],
+    # typo for false negative
+    [{"label": "PERSON", "pattern": "Korby"}],
+    # random stuff for false positive
+    [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
+    ]
+)
+def test_no_gold_ents(patterns):
+    # test that annotating components work
+    TRAIN_DATA = [
+        (
+            "Kirby is pink",
+            {
+                "links": {(0, 5): {"Q613241": 1.0}},
+                "entities": [(0, 5, "CHARACTER")],
+                "sent_starts": [1, 0, 0],
+            },
+        )
+    ]
+    nlp = English()
+    vector_length = 3
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))

-        # Create a ruler to mark entities
-        ruler = nlp.add_pipe("entity_ruler")
-        ruler.add_patterns(patterns)
+    # Create a ruler to mark entities
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)

-        # Apply ruler to examples. In a real pipeline this would be an annotating component.
-        for eg in train_examples:
-            eg.predicted = ruler(eg.predicted)
+    # Apply ruler to examples. In a real pipeline this would be an annotating component.
+    for eg in train_examples:
+        eg.predicted = ruler(eg.predicted)

-        def create_kb(vocab):
-            # create artificial KB
-            mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
-            mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
-            mykb.add_alias("Kirby", ["Q613241"], [0.9])
-            # Placeholder
-            mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
-            mykb.add_alias("pink", ["pink"], [0.9])
-            return mykb
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias("Kirby", ["Q613241"], [0.9])
+        # Placeholder
+        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
+        mykb.add_alias("pink", ["pink"], [0.9])
+        return mykb


-        # Create and train the Entity Linker
-        entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
-        entity_linker.set_kb(create_kb)
-        assert entity_linker.use_gold_ents == False
+    # Create and train the Entity Linker
+    entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
+    entity_linker.set_kb(create_kb)
+    assert entity_linker.use_gold_ents == False

-        optimizer = nlp.initialize(get_examples=lambda: train_examples)
-        for i in range(2):
-            losses = {}
-            nlp.update(train_examples, sgd=optimizer, losses=losses)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)

-        # adding additional components that are required for the entity_linker
-        nlp.add_pipe("sentencizer", first=True)
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)

-        # this will run the pipeline on the examples and shouldn't crash
-        results = nlp.evaluate(train_examples)
+    # this will run the pipeline on the examples and shouldn't crash
+    results = nlp.evaluate(train_examples)
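The hunk above only de-indents test_no_gold_ents so that it sits at module level rather than inside the body of the previous test. As a side note (not part of the commit), a minimal standalone sketch of why the nesting matters: pytest only collects test functions defined at module or test-class level, so a test defined inside another test's body is never collected and never runs unless it is called explicitly. The names below are hypothetical.

# Hypothetical example, not from the spaCy test suite: pytest collects the
# module-level test_outer, but test_inner is only a local function definition.
def test_outer():
    def test_inner():
        assert False  # would fail if it ever ran, but pytest never calls it

    assert True  # test_outer passes; test_inner is silently skipped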
@@ -263,11 +263,11 @@ cdef class Example:
         kept. Otherwise only the character indices need to match.
         """
         gold = {}
-        for ent in self.reference:
+        for ent in self.reference.ents:
             gold[(ent.start_char, ent.end_char)] = ent.label

         keep = []
-        for ent in self.predicted:
+        for ent in self.predicted.ents:
             key = (ent.start_char, ent.end_char)
             if key not in gold:
                 continue
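The hunk above is the actual fix: iterating a Doc yields Token objects, which have no start_char, end_char, or label attributes, while Doc.ents yields the annotated entity Spans, which do. A minimal sketch of the difference using the public spaCy API (illustrative only, not part of the commit):

# Illustrative sketch: Doc iteration vs. Doc.ents iteration.
import spacy

nlp = spacy.blank("en")
doc = nlp("Kirby is pink")
doc.ents = [doc.char_span(0, 5, label="CHARACTER")]

for item in doc:
    # Iterating the Doc yields Token objects, which lack the span
    # attributes that get_matching_ents relies on.
    print(type(item).__name__, hasattr(item, "start_char"))  # Token False

for ent in doc.ents:
    # Iterating Doc.ents yields entity Spans with char offsets and a label.
    print(ent.start_char, ent.end_char, ent.label_)  # 0 5 CHARACTER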