Make get_sentence_map work with init

When sentences are not available, just treat the whole doc as one sentence. This is a reasonable general fallback, but it is especially important for the init call, where upstream components aren't run.
2025-08-09 14:44:52 +03:00 · 2021-05-18 19:54:54 +09:00 · 2021-05-18 19:54:54 +09:00 · a7d9c8156d
commit a7d9c8156d
parent 883c137b26
2 changed files with 15 additions and 9 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@ -145,6 +145,7 @@ def span_embeddings_forward(

    tokvecs, docs = inputs

+    #TODO fix this
    dim = tokvecs[0].shape[1]

    get_mentions = model.attrs["get_mentions"]
--- a/spacy/ml/models/coref_util.py
+++ b/spacy/ml/models/coref_util.py
@ -1,6 +1,6 @@
 from thinc.types import Ints2d
 from spacy.tokens import Doc
-from typing import List, Tuple, Callable
+from typing import List, Tuple, Callable, Any
 from ...util import registry

 # type alias to make writing this less tedious
@ -109,13 +109,18 @@ def get_predicted_clusters(
 def get_sentence_map(doc: Doc):
    """For the given span, return a list of sentence indexes."""

-    si = 0
-    out = []
-    for sent in doc.sents:
-        for tok in sent:
-            out.append(si)
-        si += 1
-    return out
+    try:
+        si = 0
+        out = []
+        for sent in doc.sents:
+            for tok in sent:
+                out.append(si)
+            si += 1
+        return out
+    except ValueError:
+        # If there are no sents then just return dummy values.
+        # Shouldn't happen in general training, but typical in init.
+        return [0] * len(doc)


 def get_candidate_mentions(
@ -144,7 +149,7 @@ def get_candidate_mentions(


@registry.misc("spacy.CorefCandidateGenerator.v0")
-def create_mention_generator() -> Callable:
+def create_mention_generator() -> Any:
    return get_candidate_mentions