Fix get_matching_ents (#10451)

* Fix get_matching_ents: Not sure what happened here - the code prior to this commit simply does not work. It's already covered by the entity linker tests, which were succeeding in the NEL PR but couldn't possibly succeed on master.
* Fix test: The test was indented inside another test, so it doesn't seem to have been running properly.
2025-07-18 20:22:25 +03:00 · 2022-03-08 00:56:57 +09:00 · 2022-03-08 00:56:57 +09:00 · 61ba5450ff
commit 61ba5450ff
parent 7ed7908716
2 changed files with 56 additions and 56 deletions
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

-    @pytest.mark.parametrize("patterns", [
-        # perfect case
-        [{"label": "CHARACTER", "pattern": "Kirby"}],
-        # typo for false negative
-        [{"label": "PERSON", "pattern": "Korby"}],
-        # random stuff for false positive
-        [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
-        ]
-    )
-    def test_no_gold_ents(patterns):
-        # test that annotating components work
-        TRAIN_DATA = [
-            (
-                "Kirby is pink",
-                {
-                    "links": {(0, 5): {"Q613241": 1.0}},
-                    "entities": [(0, 5, "CHARACTER")],
-                    "sent_starts": [1, 0, 0],
-                },
-            )
-        ]
-        nlp = English()
-        vector_length = 3
-        train_examples = []
-        for text, annotation in TRAIN_DATA:
-            doc = nlp(text)
-            train_examples.append(Example.from_dict(doc, annotation))
+@pytest.mark.parametrize("patterns", [
+    # perfect case
+    [{"label": "CHARACTER", "pattern": "Kirby"}],
+    # typo for false negative
+    [{"label": "PERSON", "pattern": "Korby"}],
+    # random stuff for false positive
+    [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
+    ]
+)
+def test_no_gold_ents(patterns):
+    # test that annotating components work
+    TRAIN_DATA = [
+        (
+            "Kirby is pink",
+            {
+                "links": {(0, 5): {"Q613241": 1.0}},
+                "entities": [(0, 5, "CHARACTER")],
+                "sent_starts": [1, 0, 0],
+            },
+        )
+    ]
+    nlp = English()
+    vector_length = 3
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))

-        # Create a ruler to mark entities
-        ruler = nlp.add_pipe("entity_ruler")
-        ruler.add_patterns(patterns)
+    # Create a ruler to mark entities
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)

-        # Apply ruler to examples. In a real pipeline this would be an annotating component.
-        for eg in train_examples:
-            eg.predicted = ruler(eg.predicted)
+    # Apply ruler to examples. In a real pipeline this would be an annotating component.
+    for eg in train_examples:
+        eg.predicted = ruler(eg.predicted)

-        def create_kb(vocab):
-            # create artificial KB
-            mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
-            mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
-            mykb.add_alias("Kirby", ["Q613241"], [0.9])
-            # Placeholder
-            mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
-            mykb.add_alias("pink", ["pink"], [0.9])
-            return mykb
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias("Kirby", ["Q613241"], [0.9])
+        # Placeholder
+        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
+        mykb.add_alias("pink", ["pink"], [0.9])
+        return mykb


-        # Create and train the Entity Linker
-        entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
-        entity_linker.set_kb(create_kb)
-        assert entity_linker.use_gold_ents == False
+    # Create and train the Entity Linker
+    entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
+    entity_linker.set_kb(create_kb)
+    assert entity_linker.use_gold_ents == False

-        optimizer = nlp.initialize(get_examples=lambda: train_examples)
-        for i in range(2):
-            losses = {}
-            nlp.update(train_examples, sgd=optimizer, losses=losses)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)

-        # adding additional components that are required for the entity_linker
-        nlp.add_pipe("sentencizer", first=True)
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)

-        # this will run the pipeline on the examples and shouldn't crash
-        results = nlp.evaluate(train_examples)
+    # this will run the pipeline on the examples and shouldn't crash
+    results = nlp.evaluate(train_examples)
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -263,11 +263,11 @@ cdef class Example:
        kept. Otherwise only the character indices need to match.
        """
        gold = {}
-        for ent in self.reference:
+        for ent in self.reference.ents:
            gold[(ent.start_char, ent.end_char)] = ent.label

        keep = []
-        for ent in self.predicted:
+        for ent in self.predicted.ents:
            key = (ent.start_char, ent.end_char)
            if key not in gold:
                continue