Mirror of https://github.com/explosion/spaCy.git, synced 2025-06-30 09:53:04 +03:00
Add test for candidate stream processing. Simplify processing of candidate streams.
parent f5f20b0b92
commit ba91d0d1d9
@@ -494,7 +494,7 @@ class EntityLinker(TrainablePipe):
             if len(doc) == 0 or len(doc.ents) == 0:
                 continue
             sentences = [s for s in doc.sents]
-            doc_ent_cands = list(next(all_ent_cands)) if len(doc.ents) else []
+            doc_ent_cands = list(next(all_ent_cands))
 
             # Looping over candidate entities for this doc. (TODO: rewrite)
             for ent_cand_idx, ent in enumerate(doc.ents):
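Why dropping the guard is safe: documents with no entities are skipped by the `continue` above before the candidate stream is advanced, so `next(all_ent_cands)` is only ever called for documents that do have entities. Below is a minimal, self-contained sketch of that alignment invariant; the dict-based docs and the `candidate_batches` helper are toy stand-ins, not spaCy's actual `Doc` or candidate-generation API.

def candidate_batches(docs):
    # Yield one batch of candidates per document that has entities,
    # mirroring what the simplified predict() loop assumes.
    for doc in docs:
        if doc["ents"]:
            yield [f"cand:{ent}" for ent in doc["ents"]]

docs = [
    {"ents": ["New York"]},
    {"ents": []},  # skipped by both the producer and the consumer
    {"ents": ["Berlin"]},
]

all_ent_cands = candidate_batches(docs)
for doc in docs:
    if not doc["ents"]:
        continue  # mirrors `if len(doc) == 0 or len(doc.ents) == 0: continue`
    # Safe without the old `if len(doc.ents) else []` guard: the two
    # streams advance in lockstep over the non-empty documents.
    doc_ent_cands = list(next(all_ent_cands))
    print(doc_ent_cands)  # ['cand:New York'], then ['cand:Berlin']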
@@ -179,7 +179,7 @@ def test_no_entities():
             {
                 "sent_starts": [1, 0, 0, 0, 0],
             },
-        )
+        ),
     ]
     nlp = English()
     vector_length = 3
@@ -1209,3 +1209,65 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
 
     assert len(doc.ents) == 1
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
+
+
+def test_nel_candidate_processing():
+    """Test that NEL handles candidate streams correctly in a set of documents with & without entities, as well as
+    empty documents.
+    """
+    train_data = [
+        (
+            "The sky over New York is blue.",
+            {
+                "sent_starts": [1, 0, 0, 0, 0, 0, 0, 0],
+            },
+        ),
+        (
+            "They visited New York.",
+            {
+                "sent_starts": [1, 0, 0, 0, 0],
+            },
+        ),
+        # (
+        #     "",
+        #     {}
+        # ),
+        # (
+        #     "New York is a city.",
+        #     {
+        #         "sent_starts": [1, 0, 0, 0, 0, 0],
+        #     }
+        # ),
+    ]
+
+    nlp = English()
+    # Add a custom rule-based component to mimic NER ("New York" is two tokens, hence two token patterns).
+    ruler = nlp.add_pipe("entity_ruler", last=True)
+    ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "new"}, {"LOWER": "york"}]}])  # type: ignore
+
+    vector_length = 3
+    train_examples = []
+    for text, annotation in train_data:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # Create an artificial KB.
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q60", freq=12, entity_vector=[1, 2, 3])
+        mykb.add_alias("New York", ["Q60"], [0.9])
+        return mykb
+
+    # Create and train the entity linker.
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # Add the additional components required by the entity_linker.
+    nlp.add_pipe("sentencizer", first=True)
+
+    # This runs the pipeline on the examples and shouldn't crash.
+    nlp.evaluate(train_examples)
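For reference, the diff omits the test module's imports; the names used by the new test resolve as follows in spaCy v3.5+ (listed here for convenience, not part of the commit):

from spacy.kb import InMemoryLookupKB  # in-memory knowledge base (spaCy >= 3.5)
from spacy.lang.en import English
from spacy.training import Example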