avoid repetitive entities in the output

2025-08-09 06:34:54 +03:00 · 2021-05-28 16:52:51 +02:00 · 2021-05-28 16:52:51 +02:00 · 0aa1083ce8
commit 0aa1083ce8
parent 0d81bce9cc
3 changed files with 35 additions and 10 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -379,7 +379,7 @@ def ant_scorer_forward(

        scores = pw_prod + pw_sum + mask

-        top_scores, top_scores_idx = topk(xp, scores, ant_limit)
+        top_scores, top_scores_idx = topk(xp, scores, min(ant_limit, len(scores)))
        out.append((top_scores, top_scores_idx))

        # In the full model these scores can be further refined. In the current
--- a/spacy/ml/models/coref_util.py
+++ b/spacy/ml/models/coref_util.py
@@ -109,16 +109,15 @@ def get_predicted_clusters(

 def get_sentence_map(doc: Doc):
    """For the given span, return a list of sentence indexes."""
-
-    try:
+    if doc.is_sentenced:
        si = 0
        out = []
        for sent in doc.sents:
-            for tok in sent:
+            for _ in sent:
                out.append(si)
            si += 1
        return out
-    except ValueError:
+    else:
        # If there are no sents then just return dummy values.
        # Shouldn't happen in general training, but typical in init.
        return [0] * len(doc)
@@ -198,8 +197,9 @@ def select_non_crossing_spans(

    # sort idxs by order in doc
    selected = sorted(selected, key=lambda idx: (starts[idx], ends[idx]))
-    while len(selected) < limit:
-        selected.append(selected[0])  # this seems a bit weird?
+    # This was causing many repetitive entities in the output - removed for now
+    # while len(selected) < limit:
+    #     selected.append(selected[0])  # this seems a bit weird?
    return selected


--- a/spacy/tests/pipeline/test_coref.py
+++ b/spacy/tests/pipeline/test_coref.py
@@ -1,4 +1,6 @@
 import pytest
+import spacy
+
 from spacy import util
 from spacy.training import Example
 from spacy.lang.en import English
@@ -50,8 +52,9 @@ def test_initialized(nlp):
    assert nlp.pipe_names == ["coref"]
    text = "She gave me her pen."
    doc = nlp(text)
-    # TODO: The results of this are weird & non-deterministic
-    print(doc.spans)
+    for k, v in doc.spans.items():
+        # Ensure there are no "She, She, She, She, She, ..." problems
+        assert len(v) <= 15


 def test_initialized_short(nlp):
@@ -73,6 +76,28 @@ def test_initialized_2(nlp):
    print(nlp(text).spans)


+def test_coref_serialization(nlp):
+    # Test that the coref component can be serialized
+    nlp.add_pipe("coref", last=True)
+    nlp.initialize()
+    assert nlp.pipe_names == ["coref"]
+    text = "She gave me her pen."
+    doc = nlp(text)
+    spans_result = doc.spans
+
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = spacy.load(tmp_dir)
+        assert nlp2.pipe_names == ["coref"]
+        doc2 = nlp2(text)
+        spans_result2 = doc2.spans
+        print(1, [(k, len(v)) for k, v in spans_result.items()])
+        print(2, [(k, len(v)) for k, v in spans_result2.items()])
+        for k, v in spans_result.items():
+            assert spans_result[k] == spans_result2[k]
+        # assert spans_result == spans_result2
+
+
 def test_overfitting_IO(nlp):
    # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
    train_examples = []
@@ -90,7 +115,7 @@ def test_overfitting_IO(nlp):
        nlp.update(train_examples, sgd=optimizer, losses=losses)
        doc = nlp(test_text)
        print(i, doc.spans)
-    print(losses["coref"]) # < 0.001
+    print(losses["coref"])  # < 0.001

    # test the trained model
    doc = nlp(test_text)