From 2e8f0e9168fe8a05b3f40ac84995273d31691d37 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 16 May 2022 16:50:10 +0900
Subject: [PATCH] Rename coref params

---
 spacy/ml/models/coref.py          | 59 +++++++++++++++----------------
 spacy/pipeline/coref.py           |  9 +++--
 website/docs/api/architectures.md | 39 ++++++++++++--------
 3 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index cfbe83a7a..299abdc6b 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -14,14 +14,13 @@ from .coref_util import add_dummy
 @registry.architectures("spacy.Coref.v1")
 def build_wl_coref_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    embedding_size: int = 20,
+    distance_embedding_size: int = 20,
     hidden_size: int = 1024,
-    n_hidden_layers: int = 1,  # TODO rename to "depth"?
+    depth: int = 1,
     dropout: float = 0.3,
     # pairs to keep per mention after rough scoring
-    rough_k: int = 50,
-    # TODO is this not a training loop setting?
-    a_scoring_batch_size: int = 512,
+    antecedent_limit: int = 50,
+    antecedent_batch_size: int = 512,
 ):
     # TODO add model return types
     # TODO fix this
@@ -35,12 +34,12 @@ def build_wl_coref_model(
     coref_scorer = PyTorchWrapper(
         CorefScorer(
             dim,
-            embedding_size,
+            distance_embedding_size,
             hidden_size,
-            n_hidden_layers,
+            depth,
             dropout,
-            rough_k,
-            a_scoring_batch_size,
+            antecedent_limit,
+            antecedent_batch_size,
         ),
         convert_inputs=convert_coref_scorer_inputs,
         convert_outputs=convert_coref_scorer_outputs,
@@ -99,7 +98,7 @@ class CorefScorer(torch.nn.Module):
         dist_emb_size: int,
         hidden_size: int,
         n_layers: int,
-        dropout_rate: float,
+        dropout: float,
         roughk: int,
         batch_size: int,
     ):
@@ -109,31 +108,31 @@
         dist_emb_size: Size of the distance embeddings.
         hidden_size: Size of the coreference candidate embeddings.
         n_layers: Number of layers in the AnaphoricityScorer.
-        dropout_rate: Dropout probability to apply across all modules.
+        dropout: Dropout probability to apply across all modules.
         roughk: Number of candidates the RoughScorer returns.
         batch_size: Internal batch size for the more expensive scorer.
""" - self.dropout = torch.nn.Dropout(dropout_rate) + self.dropout = torch.nn.Dropout(dropout) self.batch_size = batch_size # Modules - self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate) + self.pw = DistancePairwiseEncoder(dist_emb_size, dropout) pair_emb = dim * 3 + self.pw.shape self.a_scorer = AnaphoricityScorer( pair_emb, hidden_size, n_layers, - dropout_rate + dropout ) self.lstm = torch.nn.LSTM( input_size=dim, hidden_size=dim, batch_first=True, ) - self.rough_scorer = RoughScorer(dim, dropout_rate, roughk) - self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate) + self.rough_scorer = RoughScorer(dim, dropout, roughk) + self.pw = DistancePairwiseEncoder(dist_emb_size, dropout) pair_emb = dim * 3 + self.pw.shape self.a_scorer = AnaphoricityScorer( - pair_emb, hidden_size, n_layers, dropout_rate + pair_emb, hidden_size, n_layers, dropout ) def forward( @@ -190,18 +189,18 @@ class CorefScorer(torch.nn.Module): class AnaphoricityScorer(torch.nn.Module): """Calculates anaphoricity scores by passing the inputs into a FFNN""" - def __init__(self, in_features: int, hidden_size, n_hidden_layers, dropout_rate): + def __init__(self, in_features: int, hidden_size, depth, dropout): super().__init__() hidden_size = hidden_size - if not n_hidden_layers: + if not depth: hidden_size = in_features layers = [] - for i in range(n_hidden_layers): + for i in range(depth): layers.extend( [ torch.nn.Linear(hidden_size if i else in_features, hidden_size), torch.nn.LeakyReLU(), - torch.nn.Dropout(dropout_rate), + torch.nn.Dropout(dropout), ] ) self.hidden = torch.nn.Sequential(*layers) @@ -243,7 +242,7 @@ class AnaphoricityScorer(torch.nn.Module): def _ffnn(self, x: torch.Tensor) -> torch.Tensor: """ x: tensor of shape (batch_size x roughk x n_features - returns: tensor of shape (batch_size x rough_k) + returns: tensor of shape (batch_size x antecedent_limit) """ x = self.out(self.hidden(x)) return x.squeeze(2) @@ -289,11 +288,11 @@ class RoughScorer(torch.nn.Module): steps to reduce computational cost. """ - def __init__(self, features: int, dropout_rate: float, rough_k: float): + def __init__(self, features: int, dropout: float, antecedent_limit: int): super().__init__() - self.dropout = torch.nn.Dropout(dropout_rate) + self.dropout = torch.nn.Dropout(dropout) self.bilinear = torch.nn.Linear(features, features) - self.k = rough_k + self.k = antecedent_limit def forward( self, # type: ignore # pylint: disable=arguments-differ #35566 in pytorch @@ -317,7 +316,7 @@ class RoughScorer(torch.nn.Module): class DistancePairwiseEncoder(torch.nn.Module): - def __init__(self, embedding_size, dropout_rate): + def __init__(self, distance_embedding_size, dropout): """ Takes the top_indices indicating, which is a ranked list for each word and its most likely corresponding @@ -325,15 +324,15 @@ class DistancePairwiseEncoder(torch.nn.Module): up a distance embedding from a table, where the distance corresponds to the log-distance. - embedding_size: int, + distance_embedding_size: int, Dimensionality of the distance-embeddings table. - dropout_rate: float, + dropout: float, Dropout probability. 
""" super().__init__() - emb_size = embedding_size + emb_size = distance_embedding_size self.distance_emb = torch.nn.Embedding(9, emb_size) - self.dropout = torch.nn.Dropout(dropout_rate) + self.dropout = torch.nn.Dropout(dropout) self.shape = emb_size def forward( diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 5237788cc..c5bf8fbbe 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -31,13 +31,12 @@ from ..coref_scorer import Evaluator, get_cluster_info, lea default_config = """ [model] @architectures = "spacy.Coref.v1" -embedding_size = 20 +distance_embedding_size = 20 hidden_size = 1024 -n_hidden_layers = 1 +depth = 1 dropout = 0.3 -rough_k = 50 -a_scoring_batch_size = 512 -sp_embedding_size = 64 +antecedent_limit = 50 +antecedent_batch_size = 512 [model.tok2vec] @architectures = "spacy.Tok2Vec.v2" diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index fab07af65..1a807928d 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -939,12 +939,12 @@ performance if working with only token-level clusters is acceptable. > > [model] > @architectures = "spacy.Coref.v1" -> embedding_size = 20 +> distance_embedding_size = 20 > dropout = 0.3 > hidden_size = 1024 -> n_hidden_layers = 2 -> rough_k = 50 -> a_scoring_batch_size = 512 +> depth = 2 +> antecedent_limit = 50 +> antecedent_batch_size = 512 > > [model.tok2vec] > @architectures = "spacy-transformers.TransformerListener.v1" @@ -955,16 +955,16 @@ performance if working with only token-level clusters is acceptable. The `Coref` model architecture is a Thinc `Model`. -| Name | Description | -| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `embedding_size` | ~~int~~ | -| `dropout` | The dropout to use internally. Unlike some Thinc models, this has separate dropout for the internal PyTorch layers. ~~float~~ | -| `hidden_size` | Size of the main internal layers. ~~int~~ | -| `n_hidden_layers` | Depth of the internal network. ~~int~~ | -| `rough_k` | How many candidate antecedents to keep after rough scoring. This has a significant effect on memory usage. Typical values would be 50 to 200, or higher for very long documents. ~~int~~ | -| `a_scoring_batch_size` | Internal batch size. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `distance_embedding_size` | A representation of the distance between candidates. ~~int~~ | +| `dropout` | The dropout to use internally. Unlike some Thinc models, this has separate dropout for the internal PyTorch layers. ~~float~~ | +| `hidden_size` | Size of the main internal layers. ~~int~~ | +| `depth` | Depth of the internal network. ~~int~~ | +| `antecedent_limit` | How many candidate antecedents to keep after rough scoring. This has a significant effect on memory usage. Typical values would be 50 to 200, or higher for very long documents. ~~int~~ | +| `antecedent_batch_size` | Internal batch size. 
~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.SpanPredictor.v1 {#SpanPredictor} @@ -985,3 +985,14 @@ The `Coref` model architecture is a Thinc `Model`. > ``` The `SpanPredictor` model architecture is a Thinc `Model`. + +| Name | Description | +| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `distance_embedding_size` | A representation of the distance between two candidates. ~~int~~ | +| `dropout` | The dropout to use internally. Unlike some Thinc models, this has separate dropout for the internal PyTorch layers. ~~float~~ | +| `hidden_size` | Size of the main internal layers. ~~int~~ | +| `depth` | Depth of the internal network. ~~int~~ | +| `antecedent_limit` | How many candidate antecedents to keep after rough scoring. This has a significant effect on memory usage. Typical values would be 50 to 200, or higher for very long documents. ~~int~~ | +| `antecedent_batch_size` | Internal batch size. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], TupleFloats2d]~~ |
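
A quick usage note for reviewers, not part of the patch itself: the sketch below resolves the renamed `spacy.Coref.v1` settings through the registry, so a config still using the old names fails validation immediately. It assumes a spaCy build that includes this experimental coref architecture; the `tok2vec` sub-blocks are generic `spacy.Tok2Vec.v2` settings chosen only to make the snippet self-contained, and the `depth` key inside the encoder block is unrelated to the coref `depth` parameter.

```python
# Minimal sketch, assuming an install that ships the experimental
# spacy.Coref.v1 architecture from this patch. The [model] block mirrors
# the new default_config above; the tok2vec sub-blocks are ordinary
# Tok2Vec settings and are not part of the patch.
from thinc.api import Config
from spacy.util import registry

CONFIG_STR = """
[model]
@architectures = "spacy.Coref.v1"
distance_embedding_size = 20
hidden_size = 1024
depth = 1
dropout = 0.3
antecedent_limit = 50
antecedent_batch_size = 512

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
window_size = 1
maxout_pieces = 3
depth = 4
"""

config = Config().from_str(CONFIG_STR)
# Resolving builds the model, so a config that still uses the old names
# (embedding_size, n_hidden_layers, rough_k, a_scoring_batch_size) now
# raises a config validation error here instead of failing at train time.
model = registry.resolve(config)["model"]
```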