Clean up util code

Moved everything into coref_util.py, deleted wl-specific file.
Paul O'Leary McCann 2022-03-15 19:59:44 +09:00
parent 55039a66ad
commit abdc7d87af
3 changed files with 32 additions and 136 deletions

View File

@@ -458,7 +458,7 @@ import torch
 from thinc.util import xp2torch, torch2xp

 # TODO rename this to coref_util
-from .coref_util_wl import add_dummy
+from .coref_util import add_dummy

 # TODO rename to plain coref
 @registry.architectures("spacy.WLCoref.v1")

View File: coref_util.py

@@ -1,13 +1,43 @@
 from thinc.types import Ints2d
 from spacy.tokens import Doc
-from typing import List, Tuple, Callable, Any
+from typing import List, Tuple, Callable, Any, Set, Dict
 from ...util import registry
+import torch

 # type alias to make writing this less tedious
 MentionClusters = List[List[Tuple[int, int]]]

 DEFAULT_CLUSTER_PREFIX = "coref_clusters"

+EPSILON = 1e-7
+
+
+class GraphNode:
+    def __init__(self, node_id: int):
+        self.id = node_id
+        self.links: Set[GraphNode] = set()
+        self.visited = False
+
+    def link(self, another: "GraphNode"):
+        self.links.add(another)
+        another.links.add(self)
+
+    def __repr__(self) -> str:
+        return str(self.id)
+
+
+def add_dummy(tensor: torch.Tensor, eps: bool = False):
+    """Prepends zeros (or a very small value if eps is True)
+    to the first (not zeroth) dimension of tensor.
+    """
+    kwargs = dict(device=tensor.device, dtype=tensor.dtype)
+    shape: List[int] = list(tensor.shape)
+    shape[1] = 1
+    if not eps:
+        dummy = torch.zeros(shape, **kwargs)  # type: ignore
+    else:
+        dummy = torch.full(shape, EPSILON, **kwargs)  # type: ignore
+    output = torch.cat((dummy, tensor), dim=1)
+    return output
+
+
 def doc2clusters(doc: Doc, prefix=DEFAULT_CLUSTER_PREFIX) -> MentionClusters:
     """Given a doc, give the mention clusters.

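For reference, add_dummy prepends a column along dimension 1, which antecedent scorers in the wl-coref family typically reserve as the "no antecedent" option. A minimal usage sketch, assuming the module lives at spacy.ml.models.coref_util (the import path is not shown in this diff):

    import torch
    from spacy.ml.models.coref_util import add_dummy  # import path assumed

    scores = torch.rand(5, 3)    # 5 mentions, 3 antecedent scores each
    padded = add_dummy(scores)   # a zero column is prepended on dim 1
    assert padded.shape == (5, 4)
    assert torch.all(padded[:, 0] == 0)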
View File: coref_util_wl.py (deleted)

@@ -1,134 +0,0 @@
-""" Contains functions not directly linked to coreference resolution """
-from typing import List, Set, Dict, Tuple
-from thinc.types import Ints1d
-from dataclasses import dataclass
-from ...tokens import Doc
-from ...language import Language
-import torch
-
-EPSILON = 1e-7
-
-
-class GraphNode:
-    def __init__(self, node_id: int):
-        self.id = node_id
-        self.links: Set[GraphNode] = set()
-        self.visited = False
-
-    def link(self, another: "GraphNode"):
-        self.links.add(another)
-        another.links.add(self)
-
-    def __repr__(self) -> str:
-        return str(self.id)
-
-
-def add_dummy(tensor: torch.Tensor, eps: bool = False):
-    """Prepends zeros (or a very small value if eps is True)
-    to the first (not zeroth) dimension of tensor.
-    """
-    kwargs = dict(device=tensor.device, dtype=tensor.dtype)
-    shape: List[int] = list(tensor.shape)
-    shape[1] = 1
-    if not eps:
-        dummy = torch.zeros(shape, **kwargs)  # type: ignore
-    else:
-        dummy = torch.full(shape, EPSILON, **kwargs)  # type: ignore
-    output = torch.cat((dummy, tensor), dim=1)
-    return output
-
-
-# TODO replace with spaCy config
-@dataclass
-class CorefConfig:  # pylint: disable=too-many-instance-attributes, too-few-public-methods
-    """ Contains values needed to set up the coreference model. """
-    section: str
-    data_dir: str
-    train_data: str
-    dev_data: str
-    test_data: str
-    device: str
-    bert_model: str
-    bert_window_size: int
-    embedding_size: int
-    sp_embedding_size: int
-    a_scoring_batch_size: int
-    hidden_size: int
-    n_hidden_layers: int
-    max_span_len: int
-    rough_k: int
-    bert_finetune: bool
-    bert_mini_finetune: bool
-    dropout_rate: float
-    learning_rate: float
-    bert_learning_rate: float
-    train_epochs: int
-    bce_loss_weight: float
-    tokenizer_kwargs: Dict[str, dict]
-    conll_log_dir: str
-
-
-def get_sent_ids(doc):
-    sid = 0
-    sids = []
-    for sent in doc.sents:
-        for tok in sent:
-            sids.append(sid)
-        sid += 1
-    return sids
-
-
-def get_cluster_ids(doc):
-    """Get the cluster ids of head tokens."""
-    out = [0] * len(doc)
-    head_spangroups = [doc.spans[sk] for sk in doc.spans if sk.startswith("coref_word_clusters")]
-    for ii, group in enumerate(head_spangroups, start=1):
-        for span in group:
-            out[span[0].i] = ii
-    return out
-
-
-def get_head2span(doc):
-    out = []
-    for sk in doc.spans:
-        if not sk.startswith("coref_clusters"):
-            continue
-        if len(doc.spans[sk]) == 1:
-            print("===== UNARY MENTION ====")
-        for span in doc.spans[sk]:
-            out.append((span.root.i, span.start, span.end))
-    return out
-
-
-def doc2tensors(
-    xp,
-    doc: Doc
-) -> Tuple[Ints1d, Ints1d, Ints1d, Ints1d, Ints1d]:
-    sent_ids = get_sent_ids(doc)
-    cluster_ids = get_cluster_ids(doc)
-    head2span = get_head2span(doc)
-    if not head2span:
-        heads, starts, ends = [], [], []
-    else:
-        heads, starts, ends = zip(*head2span)
-
-    sent_ids = xp.asarray(sent_ids)
-    cluster_ids = xp.asarray(cluster_ids)
-    heads = xp.asarray(heads)
-    starts = xp.asarray(starts)
-    ends = xp.asarray(ends) - 1
-    return sent_ids, cluster_ids, heads, starts, ends
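GraphNode itself is unchanged by the move: link stores mention links symmetrically, and the visited flag supports the kind of traversal that reads coreference clusters off as connected components. A sketch of that traversal, assuming the same coref_util import path as above; the cluster_of helper is illustrative and not part of this commit:

    from typing import List
    from spacy.ml.models.coref_util import GraphNode  # import path assumed

    def cluster_of(node: GraphNode) -> List[int]:
        # Illustrative helper: collect the connected component containing
        # node by walking .links and marking .visited along the way.
        stack, out = [node], []
        node.visited = True
        while stack:
            current = stack.pop()
            out.append(current.id)
            for neighbor in current.links:
                if not neighbor.visited:
                    neighbor.visited = True
                    stack.append(neighbor)
        return sorted(out)

    # Linking token 0 to 3 and 3 to 7 produces one three-token cluster.
    a, b, c = GraphNode(0), GraphNode(3), GraphNode(7)
    a.link(b)
    b.link(c)
    assert cluster_of(a) == [0, 3, 7]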