Formatting

2025-09-19 18:42:37 +03:00 · 2022-05-10 19:09:52 +09:00 · 2022-05-10 19:09:52 +09:00 · 33f4f90ff0
commit 33f4f90ff0
parent 41fc092674
4 changed files with 58 additions and 60 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@ -64,7 +64,6 @@ def build_wl_coref_model(
    return coref_model
 def convert_coref_scorer_inputs(model: Model, X: List[Floats2d], is_train: bool):
    # The input here is List[Floats2d], one for each doc
    # just use the first
@ -96,7 +95,6 @@ def convert_coref_scorer_outputs(model: Model, inputs_outputs, is_train: bool):
    return (scores_xp, indices_xp), convert_for_torch_backward
 # TODO add docstring for this, maybe move to utils.
 # This might belong in the component.
 def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
@ -126,7 +124,6 @@ def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
    return sorted(clusters)
 class CorefScorer(torch.nn.Module):
    """Combines all coref modules together to find coreferent spans.
@ -372,8 +369,6 @@ class RoughScorer(torch.nn.Module):
        return top_scores, indices
 class DistancePairwiseEncoder(torch.nn.Module):
    def __init__(self, embedding_size, dropout_rate):
        super().__init__()
--- a/spacy/ml/models/span_predictor.py
+++ b/spacy/ml/models/span_predictor.py
@ -10,6 +10,7 @@ from ...tokens import Doc
 from ...util import registry
 from .coref_util import get_sentence_ids
@registry.architectures("spacy.SpanPredictor.v1")
 def build_span_predictor(
    tok2vec: Model[List[Doc], List[Floats2d]],
@ -34,6 +35,7 @@ def build_span_predictor(
    return model
 def convert_span_predictor_inputs(
    model: Model, X: Tuple[Ints1d, Floats2d, Ints1d], is_train: bool
 ):
@ -89,6 +91,38 @@ def predict_span_clusters(
    return [[head2span[head] for head in cluster] for cluster in clusters]
 def build_get_head_metadata(prefix):
    """Build a Model named "HeadDataProvider" that runs `head_data_forward`.

    prefix: Stored in the model's attrs under "prefix"; `head_data_forward`
        reads it to decide which `doc.spans` keys to use.
    RETURNS: The constructed Model.
    """
    # TODO this name is awful, fix it
    model = Model(
        "HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward
    )
    return model
 def head_data_forward(model, docs, is_train):
    """A layer to generate the extra data needed for the span predictor.

    For each doc, collects the result of `get_sentence_ids(doc)` and the
    index of the first token of every span in the span groups whose key
    starts with the model's "prefix" attr.

    model: The model; its "prefix" attr and its `ops` are read.
    docs: The docs to extract data from.
    is_train: Not used in this function.
    RETURNS: A tuple `((sent_ids, head_ids), backprop)`. Both lists hold one
        array per doc; `backprop` is a placeholder that returns an empty list.
    """
    sent_ids = []
    head_ids = []
    prefix = model.attrs["prefix"]
    for doc in docs:
        sids = model.ops.asarray2i(get_sentence_ids(doc))
        sent_ids.append(sids)
        heads = []
        for key, sg in doc.spans.items():
            # skip span groups that are not stored under the configured prefix
            if not key.startswith(prefix):
                continue
            for span in sg:
                # only the first token of each span is recorded as the head
                # TODO warn if spans are more than one token
                heads.append(span[0].i)
        heads = model.ops.asarray2i(heads)
        head_ids.append(heads)
    # each of these is a list with one entry per doc
    # backprop is just a placeholder
    # TODO it would probably be better to have a list of tuples than two lists of arrays
    return (sent_ids, head_ids), lambda x: []
 # TODO this should maybe have a different name from the component
 class SpanPredictor(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int):
@ -181,35 +215,3 @@ class SpanPredictor(torch.nn.Module):
            valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
            return scores + valid_positions
        return scores
 def build_get_head_metadata(prefix):
    """Build a Model named "HeadDataProvider" that runs `head_data_forward`.

    prefix: Stored in the model's attrs under "prefix"; `head_data_forward`
        reads it to decide which `doc.spans` keys to use.
    RETURNS: The constructed Model.
    """
    # TODO this name is awful, fix it
    model = Model(
        "HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward
    )
    return model
 def head_data_forward(model, docs, is_train):
    """A layer to generate the extra data needed for the span predictor.

    For each doc, collects the result of `get_sentence_ids(doc)` and the
    index of the first token of every span in the span groups whose key
    starts with the model's "prefix" attr.

    model: The model; its "prefix" attr and its `ops` are read.
    docs: The docs to extract data from.
    is_train: Not used in this function.
    RETURNS: A tuple `((sent_ids, head_ids), backprop)`. Both lists hold one
        array per doc; `backprop` is a placeholder that returns an empty list.
    """
    sent_ids = []
    head_ids = []
    prefix = model.attrs["prefix"]
    for doc in docs:
        sids = model.ops.asarray2i(get_sentence_ids(doc))
        sent_ids.append(sids)
        heads = []
        for key, sg in doc.spans.items():
            # skip span groups that are not stored under the configured prefix
            if not key.startswith(prefix):
                continue
            for span in sg:
                # only the first token of each span is recorded as the head
                # TODO warn if spans are more than one token
                heads.append(span[0].i)
        heads = model.ops.asarray2i(heads)
        head_ids.append(heads)
    # each of these is a list with one entry per doc
    # backprop is just a placeholder
    # TODO it would probably be better to have a list of tuples than two lists of arrays
    return (sent_ids, head_ids), lambda x: []
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@ -84,7 +84,6 @@ def make_coref(
    )
 class CoreferenceResolver(TrainablePipe):
    """Pipeline component for coreference resolution.
@ -318,7 +317,7 @@ class CoreferenceResolver(TrainablePipe):
            log_marg = ops.softmax(cscores + ops.xp.log(top_gscores), axis=1)
        log_norm = ops.softmax(cscores, axis=1)
        grad = log_norm - log_marg
-        #gradients.append((grad, cidx))
+        # gradients.append((grad, cidx))
        loss = float((grad**2).sum())
        return loss, grad
@ -373,5 +372,3 @@ class CoreferenceResolver(TrainablePipe):
            "coref_r": evaluator.get_recall(),
        }
        return score
--- a/spacy/pipeline/span_predictor.py
+++ b/spacy/pipeline/span_predictor.py
@ -46,26 +46,30 @@ depth = 2
 """
 DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)["model"]
@Language.factory(
-        "span_predictor",
+    "span_predictor",
-        assigns=["doc.spans"],
+    assigns=["doc.spans"],
-        requires=["doc.spans"],
+    requires=["doc.spans"],
-        default_config={
+    default_config={
-            "model": DEFAULT_SPAN_PREDICTOR_MODEL,
+        "model": DEFAULT_SPAN_PREDICTOR_MODEL,
-            "input_prefix": "coref_head_clusters",
+        "input_prefix": "coref_head_clusters",
-            "output_prefix": "coref_clusters",
+        "output_prefix": "coref_clusters",
-            },
+    },
    default_score_weights={"span_accuracy": 1.0},
-    )
+)
 def make_span_predictor(
-        nlp: Language,
+    nlp: Language,
-        name: str,
+    name: str,
-        model,
+    model,
-        input_prefix: str = "coref_head_clusters",
+    input_prefix: str = "coref_head_clusters",
-        output_prefix: str = "coref_clusters",
+    output_prefix: str = "coref_clusters",
 ) -> "SpanPredictor":
    """Create a SpanPredictor component."""
-    return SpanPredictor(nlp.vocab, model, name, input_prefix=input_prefix, output_prefix=output_prefix)
+    return SpanPredictor(
        nlp.vocab, model, name, input_prefix=input_prefix, output_prefix=output_prefix
    )
 class SpanPredictor(TrainablePipe):
    """Pipeline component to resolve one-token spans to full spans.
@ -125,7 +129,7 @@ class SpanPredictor(TrainablePipe):
    def set_annotations(self, docs: Iterable[Doc], clusters_by_doc) -> None:
        for doc, clusters in zip(docs, clusters_by_doc):
            for ii, cluster in enumerate(clusters):
-                spans = [doc[mm[0]:mm[1]] for mm in cluster]
+                spans = [doc[mm[0] : mm[1]] for mm in cluster]
                doc.spans[f"{self.output_prefix}_{ii}"] = spans
    def update(
@ -218,10 +222,10 @@ class SpanPredictor(TrainablePipe):
            end_probs = ops.softmax(end_scores, axis=1)
            start_targets = to_categorical(starts, n_classes)
            end_targets = to_categorical(ends, n_classes)
-            start_grads = (start_probs - start_targets)
+            start_grads = start_probs - start_targets
-            end_grads = (end_probs - end_targets)
+            end_grads = end_probs - end_targets
            grads = ops.xp.stack((start_grads, end_grads), axis=2)
-            loss = float((grads ** 2).sum())
+            loss = float((grads**2).sum())
        return loss, grads
    def initialize(