Formatting

2025-08-08 14:14:57 +03:00 · 2022-05-10 19:09:52 +09:00 · 2022-05-10 19:09:52 +09:00 · 33f4f90ff0
commit 33f4f90ff0
parent 41fc092674
4 changed files with 58 additions and 60 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -64,7 +64,6 @@ def build_wl_coref_model(
    return coref_model


-
 def convert_coref_scorer_inputs(model: Model, X: List[Floats2d], is_train: bool):
    # The input here is List[Floats2d], one for each doc
    # just use the first
@@ -96,7 +95,6 @@ def convert_coref_scorer_outputs(model: Model, inputs_outputs, is_train: bool):
    return (scores_xp, indices_xp), convert_for_torch_backward


-
 # TODO add docstring for this, maybe move to utils.
 # This might belong in the component.
 def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
@@ -126,7 +124,6 @@ def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
    return sorted(clusters)


-
 class CorefScorer(torch.nn.Module):
    """Combines all coref modules together to find coreferent spans.

@@ -372,8 +369,6 @@ class RoughScorer(torch.nn.Module):
        return top_scores, indices


-
-
 class DistancePairwiseEncoder(torch.nn.Module):
    def __init__(self, embedding_size, dropout_rate):
        super().__init__()
--- a/spacy/ml/models/span_predictor.py
+++ b/spacy/ml/models/span_predictor.py
@@ -10,6 +10,7 @@ from ...tokens import Doc
 from ...util import registry
 from .coref_util import get_sentence_ids

+
 @registry.architectures("spacy.SpanPredictor.v1")
 def build_span_predictor(
    tok2vec: Model[List[Doc], List[Floats2d]],
@@ -34,6 +35,7 @@ def build_span_predictor(

    return model

+
 def convert_span_predictor_inputs(
    model: Model, X: Tuple[Ints1d, Floats2d, Ints1d], is_train: bool
 ):
@@ -89,6 +91,38 @@ def predict_span_clusters(

    return [[head2span[head] for head in cluster] for cluster in clusters]

+
+def build_get_head_metadata(prefix):
+    # TODO this name is awful, fix it
+    model = Model(
+        "HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward
+    )
+    return model
+
+
+def head_data_forward(model, docs, is_train):
+    """A layer to generate the extra data needed for the span predictor."""
+    sent_ids = []
+    head_ids = []
+    prefix = model.attrs["prefix"]
+    for doc in docs:
+        sids = model.ops.asarray2i(get_sentence_ids(doc))
+        sent_ids.append(sids)
+        heads = []
+        for key, sg in doc.spans.items():
+            if not key.startswith(prefix):
+                continue
+            for span in sg:
+                # TODO warn if spans are more than one token
+                heads.append(span[0].i)
+        heads = model.ops.asarray2i(heads)
+        head_ids.append(heads)
+    # each of these is a list with one entry per doc
+    # backprop is just a placeholder
+    # TODO it would probably be better to have a list of tuples than two lists of arrays
+    return (sent_ids, head_ids), lambda x: []
+
+
 # TODO this should maybe have a different name from the component
 class SpanPredictor(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int):
@@ -181,35 +215,3 @@ class SpanPredictor(torch.nn.Module):
            valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
            return scores + valid_positions
        return scores
-
-
-def build_get_head_metadata(prefix):
-    # TODO this name is awful, fix it
-    model = Model(
-        "HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward
-    )
-    return model
-
-
-def head_data_forward(model, docs, is_train):
-    """A layer to generate the extra data needed for the span predictor."""
-    sent_ids = []
-    head_ids = []
-    prefix = model.attrs["prefix"]
-    for doc in docs:
-        sids = model.ops.asarray2i(get_sentence_ids(doc))
-        sent_ids.append(sids)
-        heads = []
-        for key, sg in doc.spans.items():
-            if not key.startswith(prefix):
-                continue
-            for span in sg:
-                # TODO warn if spans are more than one token
-                heads.append(span[0].i)
-        heads = model.ops.asarray2i(heads)
-        head_ids.append(heads)
-    # each of these is a list with one entry per doc
-    # backprop is just a placeholder
-    # TODO it would probably be better to have a list of tuples than two lists of arrays
-    return (sent_ids, head_ids), lambda x: []
-
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -84,7 +84,6 @@ def make_coref(
    )


-
 class CoreferenceResolver(TrainablePipe):
    """Pipeline component for coreference resolution.

@@ -318,7 +317,7 @@ class CoreferenceResolver(TrainablePipe):
            log_marg = ops.softmax(cscores + ops.xp.log(top_gscores), axis=1)
        log_norm = ops.softmax(cscores, axis=1)
        grad = log_norm - log_marg
-        #gradients.append((grad, cidx))
+        # gradients.append((grad, cidx))
        loss = float((grad**2).sum())

        return loss, grad
@@ -373,5 +372,3 @@ class CoreferenceResolver(TrainablePipe):
            "coref_r": evaluator.get_recall(),
        }
        return score
-
-
--- a/spacy/pipeline/span_predictor.py
+++ b/spacy/pipeline/span_predictor.py
@@ -46,26 +46,30 @@ depth = 2
 """
 DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)["model"]

+
 @Language.factory(
-        "span_predictor",
-        assigns=["doc.spans"],
-        requires=["doc.spans"],
-        default_config={
-            "model": DEFAULT_SPAN_PREDICTOR_MODEL,
-            "input_prefix": "coref_head_clusters",
-            "output_prefix": "coref_clusters",
-            },
+    "span_predictor",
+    assigns=["doc.spans"],
+    requires=["doc.spans"],
+    default_config={
+        "model": DEFAULT_SPAN_PREDICTOR_MODEL,
+        "input_prefix": "coref_head_clusters",
+        "output_prefix": "coref_clusters",
+    },
    default_score_weights={"span_accuracy": 1.0},
-    )
+)
 def make_span_predictor(
-        nlp: Language,
-        name: str,
-        model,
-        input_prefix: str = "coref_head_clusters",
-        output_prefix: str = "coref_clusters",
+    nlp: Language,
+    name: str,
+    model,
+    input_prefix: str = "coref_head_clusters",
+    output_prefix: str = "coref_clusters",
 ) -> "SpanPredictor":
    """Create a SpanPredictor component."""
-    return SpanPredictor(nlp.vocab, model, name, input_prefix=input_prefix, output_prefix=output_prefix)
+    return SpanPredictor(
+        nlp.vocab, model, name, input_prefix=input_prefix, output_prefix=output_prefix
+    )
+

 class SpanPredictor(TrainablePipe):
    """Pipeline component to resolve one-token spans to full spans.
@@ -125,7 +129,7 @@ class SpanPredictor(TrainablePipe):
    def set_annotations(self, docs: Iterable[Doc], clusters_by_doc) -> None:
        for doc, clusters in zip(docs, clusters_by_doc):
            for ii, cluster in enumerate(clusters):
-                spans = [doc[mm[0]:mm[1]] for mm in cluster]
+                spans = [doc[mm[0] : mm[1]] for mm in cluster]
                doc.spans[f"{self.output_prefix}_{ii}"] = spans

    def update(
@@ -218,10 +222,10 @@ class SpanPredictor(TrainablePipe):
            end_probs = ops.softmax(end_scores, axis=1)
            start_targets = to_categorical(starts, n_classes)
            end_targets = to_categorical(ends, n_classes)
-            start_grads = (start_probs - start_targets)
-            end_grads = (end_probs - end_targets)
+            start_grads = start_probs - start_targets
+            end_grads = end_probs - end_targets
            grads = ops.xp.stack((start_grads, end_grads), axis=2)
-            loss = float((grads ** 2).sum())
+            loss = float((grads**2).sum())
        return loss, grads

    def initialize(