From 6b51258a5848439e9eba663fec66f347825ca2a2 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 9 May 2022 13:34:50 +0200
Subject: [PATCH] clean up unused imports + black formatting

---
 spacy/ml/models/coref.py | 256 ++++++++++++++++++---------------------
 1 file changed, 116 insertions(+), 140 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 0b533daf0..835aeb1ce 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -1,27 +1,22 @@
-from dataclasses import dataclass
-import warnings
-
-from thinc.api import Model, Linear, Relu, Dropout
-from thinc.api import chain, noop, Embed, add, tuplify, concatenate
-from thinc.api import reduce_first, reduce_last, reduce_mean
-from thinc.api import PyTorchWrapper, ArgsKwargs
-from thinc.types import Floats2d, Floats1d, Ints1d, Ints2d, Ragged
-from typing import List, Callable, Tuple, Any
-from ...tokens import Doc
-from ...util import registry
-from ..extract_spans import extract_spans
-
+from typing import List, Tuple
 import torch
+
+from thinc.api import Model, chain, tuplify
+from thinc.api import PyTorchWrapper, ArgsKwargs
+from thinc.types import Floats2d, Ints1d, Ints2d
 from thinc.util import xp2torch, torch2xp

+from ...tokens import Doc
+from ...util import registry
 from .coref_util import add_dummy, get_sentence_ids

+
 @registry.architectures("spacy.Coref.v1")
 def build_wl_coref_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     embedding_size: int = 20,
     hidden_size: int = 1024,
-    n_hidden_layers: int = 1, # TODO rename to "depth"?
+    n_hidden_layers: int = 1,  # TODO rename to "depth"?
     dropout: float = 0.3,
     # pairs to keep per mention after rough scoring
     # TODO change to meaningful name
@@ -30,7 +25,7 @@ def build_wl_coref_model(
     rough_k: int = 50,
     a_scoring_batch_size: int = 512,
     # span predictor embeddings
     sp_embedding_size: int = 64,
-    ):
+):
     # TODO fix this
     try:
         dim = tok2vec.get_dim("nO")
@@ -48,10 +43,10 @@ def build_wl_coref_model(
             n_hidden_layers,
             dropout,
             rough_k,
-            a_scoring_batch_size
+            a_scoring_batch_size,
         ),
         convert_inputs=convert_coref_scorer_inputs,
-        convert_outputs=convert_coref_scorer_outputs
+        convert_outputs=convert_coref_scorer_outputs,
     )
     coref_model = tok2vec >> coref_scorer
     # XXX just ignore this until the coref scorer is integrated
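A note on the `tok2vec >> coref_scorer` composition above: inside a `Model.define_operators` block, Thinc binds `>>` to `chain`, so the expression is plain layer composition. A minimal sketch with toy layer sizes (not the coref model itself):

    from thinc.api import Model, Linear, chain

    # Within the block, `>>` is sugar for `chain`, which feeds the output
    # of one layer into the next.
    with Model.define_operators({">>": chain}):
        model = Linear(nO=8, nI=4) >> Linear(nO=2, nI=8)

    # Equivalent spelling without the operator sugar:
    model = chain(Linear(nO=8, nI=4), Linear(nO=2, nI=8))
    model.initialize()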
@@ -68,12 +63,13 @@ def build_wl_coref_model(
     # and just return words as spans.
     return coref_model

+
 @registry.architectures("spacy.SpanPredictor.v1")
 def build_span_predictor(
     tok2vec: Model[List[Doc], List[Floats2d]],
     hidden_size: int = 1024,
     dist_emb_size: int = 64,
-    ):
+):
     # TODO fix this
     try:
         dim = tok2vec.get_dim("nO")
@@ -84,22 +80,16 @@ def build_span_predictor(
     with Model.define_operators({">>": chain, "&": tuplify}):
         span_predictor = PyTorchWrapper(
             SpanPredictor(dim, hidden_size, dist_emb_size),
-            convert_inputs=convert_span_predictor_inputs
+            convert_inputs=convert_span_predictor_inputs,
         )
         # TODO use proper parameter for prefix
-        head_info = build_get_head_metadata(
-            "coref_head_clusters"
-        )
+        head_info = build_get_head_metadata("coref_head_clusters")
         model = (tok2vec & head_info) >> span_predictor

     return model


-def convert_coref_scorer_inputs(
-    model: Model,
-    X: List[Floats2d],
-    is_train: bool
-):
+def convert_coref_scorer_inputs(model: Model, X: List[Floats2d], is_train: bool):
     # The input here is List[Floats2d], one for each doc
     # just use the first
     # TODO real batching
@@ -111,14 +101,10 @@ def convert_coref_scorer_inputs(
         gradients = torch2xp(args.args[0])
         return [gradients]

-    return ArgsKwargs(args=(word_features, ), kwargs={}), backprop
+    return ArgsKwargs(args=(word_features,), kwargs={}), backprop


-def convert_coref_scorer_outputs(
-    model: Model,
-    inputs_outputs,
-    is_train: bool
-):
+def convert_coref_scorer_outputs(model: Model, inputs_outputs, is_train: bool):
     _, outputs = inputs_outputs
     scores, indices = outputs

@@ -135,9 +121,7 @@


 def convert_span_predictor_inputs(
-    model: Model,
-    X: Tuple[Ints1d, Floats2d, Ints1d],
-    is_train: bool
+    model: Model, X: Tuple[Ints1d, Floats2d, Ints1d], is_train: bool
 ):
     tok2vec, (sent_ids, head_ids) = X
     # Normally we should use the input is_train, but for these two it's not relevant
@@ -160,10 +144,9 @@


 # TODO This probably belongs in the component, not the model.
-def predict_span_clusters(span_predictor: Model,
-                          sent_ids: Ints1d,
-                          words: Floats2d,
-                          clusters: List[Ints1d]):
+def predict_span_clusters(
+    span_predictor: Model, sent_ids: Ints1d, words: Floats2d, clusters: List[Ints1d]
+):
     """
     Predicts span clusters based on the word clusters.
@@ -187,20 +170,15 @@ def predict_span_clusters(span_predictor: Model,
     ends = (scores[:, :, 1].argmax(axis=1) + 1).tolist()

     head2span = {
-        head: (start, end)
-        for head, start, end in zip(heads_ids.tolist(), starts, ends)
+        head: (start, end) for head, start, end in zip(heads_ids.tolist(), starts, ends)
     }

-    return [[head2span[head] for head in cluster]
-            for cluster in clusters]
+    return [[head2span[head] for head in cluster] for cluster in clusters]
+

 # TODO add docstring for this, maybe move to utils.
 # This might belong in the component.
-def _clusterize(
-    model,
-    scores: Floats2d,
-    top_indices: Ints2d
-):
+def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
     xp = model.ops.xp
     antecedents = scores.argmax(axis=1) - 1
     not_dummy = antecedents >= 0
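The `argmax(axis=1) - 1` above relies on a convention (see the `add_dummy` import) where column 0 of the score matrix is a dummy "no antecedent" column, so subtracting one maps the dummy to -1. A toy illustration with made-up scores:

    import numpy as np

    # One row per mention; column 0 is the dummy antecedent, and
    # column j+1 corresponds to candidate antecedent j.
    scores = np.array(
        [
            [0.9, 0.1, 0.2],  # dummy wins -> this mention has no antecedent
            [0.1, 0.8, 0.3],  # column 1 wins -> antecedent is candidate 0
        ]
    )
    antecedents = scores.argmax(axis=1) - 1  # array([-1,  0])
    not_dummy = antecedents >= 0             # array([False,  True])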
@@ -229,15 +207,14 @@ def _clusterize(
 def build_get_head_metadata(prefix):
     # TODO this name is awful, fix it
-    model = Model("HeadDataProvider",
-                  attrs={'prefix': prefix},
-                  forward=head_data_forward)
+    model = Model(
+        "HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward
+    )
     return model


 def head_data_forward(model, docs, is_train):
-    """A layer to generate the extra data needed for the span predictor.
-    """
+    """A layer to generate the extra data needed for the span predictor."""
     sent_ids = []
     head_ids = []
     prefix = model.attrs["prefix"]
@@ -271,15 +248,16 @@ class CorefScorer(torch.nn.Module):
         a_scorer (AnaphoricityScorer)
         sp (SpanPredictor)
     """
+
     def __init__(
         self,
-        dim: int, # tok2vec size
+        dim: int,  # tok2vec size
         dist_emb_size: int,
         hidden_size: int,
         n_layers: int,
         dropout_rate: float,
         roughk: int,
-        batch_size: int
+        batch_size: int,
     ):
         super().__init__()
         """
@@ -290,14 +268,11 @@ class CorefScorer(torch.nn.Module):
             (useful for warm start)
         """
         self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate)
-        #TODO clean this up
+        # TODO clean this up
         bert_emb = dim
         pair_emb = bert_emb * 3 + self.pw.shape
         self.a_scorer = AnaphoricityScorer(
-            pair_emb,
-            hidden_size,
-            n_layers,
-            dropout_rate
+            pair_emb, hidden_size, n_layers, dropout_rate
         )
         self.lstm = torch.nn.LSTM(
             input_size=bert_emb,
@@ -305,17 +280,10 @@ class CorefScorer(torch.nn.Module):
             batch_first=True,
         )
         self.dropout = torch.nn.Dropout(dropout_rate)
-        self.rough_scorer = RoughScorer(
-            bert_emb,
-            dropout_rate,
-            roughk
-        )
+        self.rough_scorer = RoughScorer(bert_emb, dropout_rate, roughk)
         self.batch_size = batch_size

-    def forward(
-        self,
-        word_features: torch.Tensor
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, word_features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         This is a massive method, but it made sense to me to not split it into
         several ones to let one see the data flow.
@@ -327,7 +295,7 @@ class CorefScorer(torch.nn.Module):
         """
         # words [n_words, span_emb]
         # cluster_ids [n_words]
-        self.lstm.flatten_parameters() # XXX without this there's a warning
+        self.lstm.flatten_parameters()  # XXX without this there's a warning
        word_features = torch.unsqueeze(word_features, dim=0)
         words, _ = self.lstm(word_features)
         words = words.squeeze()
@@ -342,16 +310,18 @@ class CorefScorer(torch.nn.Module):
         a_scores_lst: List[torch.Tensor] = []

         for i in range(0, len(words), batch_size):
-            pw_batch = pw[i:i + batch_size]
-            words_batch = words[i:i + batch_size]
-            top_indices_batch = top_indices[i:i + batch_size]
-            top_rough_scores_batch = top_rough_scores[i:i + batch_size]
+            pw_batch = pw[i : i + batch_size]
+            words_batch = words[i : i + batch_size]
+            top_indices_batch = top_indices[i : i + batch_size]
+            top_rough_scores_batch = top_rough_scores[i : i + batch_size]

             # a_scores_batch [batch_size, n_ants]
             a_scores_batch = self.a_scorer(
-                all_mentions=words, mentions_batch=words_batch,
-                pw_batch=pw_batch, top_indices_batch=top_indices_batch,
-                top_rough_scores_batch=top_rough_scores_batch
+                all_mentions=words,
+                mentions_batch=words_batch,
+                pw_batch=pw_batch,
+                top_indices_batch=top_indices_batch,
+                top_rough_scores_batch=top_rough_scores_batch,
             )
             a_scores_lst.append(a_scores_batch)
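On `pair_emb = bert_emb * 3 + self.pw.shape` above: the factor of three suggests each mention/candidate pair is encoded as the two embeddings plus their elementwise product, concatenated with the pairwise distance features (the actual construction is `_get_pair_matrix` below). A hedged sketch of that layout, with hypothetical names and toy sizes:

    import torch

    def toy_pair_matrix(words, top_indices, pw):
        # words:       [n_words, emb]      mention embeddings
        # top_indices: [n_words, k]        candidate antecedents per mention
        # pw:          [n_words, k, d_pw]  pairwise distance features
        a = words[top_indices]               # candidate embeddings [n_words, k, emb]
        b = words.unsqueeze(1).expand_as(a)  # mention embeddings, broadcast
        return torch.cat((b, a, a * b, pw), dim=2)  # [n_words, k, 3*emb + d_pw]

    pairs = toy_pair_matrix(
        torch.randn(5, 4), torch.randint(0, 5, (5, 3)), torch.randn(5, 3, 2)
    )
    assert pairs.shape == (5, 3, 3 * 4 + 2)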
@@ -360,33 +330,35 @@
 class AnaphoricityScorer(torch.nn.Module):
-    """ Calculates anaphoricity scores by passing the inputs into a FFNN """
+    """Calculates anaphoricity scores by passing the inputs into a FFNN"""
+
-    def __init__(self,
-                 in_features: int,
-                 hidden_size,
-                 n_hidden_layers,
-                 dropout_rate):
+    def __init__(self, in_features: int, hidden_size, n_hidden_layers, dropout_rate):
         super().__init__()
         hidden_size = hidden_size
         if not n_hidden_layers:
             hidden_size = in_features
         layers = []
         for i in range(n_hidden_layers):
-            layers.extend([torch.nn.Linear(hidden_size if i else in_features,
-                                           hidden_size),
-                           torch.nn.LeakyReLU(),
-                           torch.nn.Dropout(dropout_rate)])
+            layers.extend(
+                [
+                    torch.nn.Linear(hidden_size if i else in_features, hidden_size),
+                    torch.nn.LeakyReLU(),
+                    torch.nn.Dropout(dropout_rate),
+                ]
+            )
         self.hidden = torch.nn.Sequential(*layers)
         self.out = torch.nn.Linear(hidden_size, out_features=1)

-    def forward(self, *,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
-                all_mentions: torch.Tensor,
-                mentions_batch: torch.Tensor,
-                pw_batch: torch.Tensor,
-                top_indices_batch: torch.Tensor,
-                top_rough_scores_batch: torch.Tensor,
-                ) -> torch.Tensor:
-        """ Builds a pairwise matrix, scores the pairs and returns the scores.
+    def forward(
+        self,
+        *,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
+        all_mentions: torch.Tensor,
+        mentions_batch: torch.Tensor,
+        pw_batch: torch.Tensor,
+        top_indices_batch: torch.Tensor,
+        top_rough_scores_batch: torch.Tensor,
+    ) -> torch.Tensor:
+        """Builds a pairwise matrix, scores the pairs and returns the scores.

         Args:
             all_mentions (torch.Tensor): [n_mentions, mention_emb]
             mentions_batch (torch.Tensor): [batch_size, mention_emb]
             pw_batch (torch.Tensor): [batch_size, n_ants, pw_emb]
             top_indices_batch (torch.Tensor): [batch_size, n_ants]
             top_rough_scores_batch (torch.Tensor): [batch_size, n_ants]

         Returns:
             torch.Tensor
                 [batch_size, n_ants + 1]
                 anaphoricity scores for the pairs + a dummy
@@ -401,7 +373,8 @@ class AnaphoricityScorer(torch.nn.Module):
         """
         # [batch_size, n_ants, pair_emb]
         pair_matrix = self._get_pair_matrix(
-            all_mentions, mentions_batch, pw_batch, top_indices_batch)
+            all_mentions, mentions_batch, pw_batch, top_indices_batch
+        )

         # [batch_size, n_ants]
         scores = top_rough_scores_batch + self._ffnn(pair_matrix)
@@ -423,11 +396,12 @@ class AnaphoricityScorer(torch.nn.Module):
         return x.squeeze(2)

     @staticmethod
-    def _get_pair_matrix(all_mentions: torch.Tensor,
-                         mentions_batch: torch.Tensor,
-                         pw_batch: torch.Tensor,
-                         top_indices_batch: torch.Tensor,
-                         ) -> torch.Tensor:
+    def _get_pair_matrix(
+        all_mentions: torch.Tensor,
+        mentions_batch: torch.Tensor,
+        pw_batch: torch.Tensor,
+        top_indices_batch: torch.Tensor,
+    ) -> torch.Tensor:
         """
         Builds the matrix used as input for AnaphoricityScorer.
@@ -464,12 +438,8 @@ class RoughScorer(torch.nn.Module):
     only top scoring candidates are considered on later steps to reduce
     computational complexity.
     """
-    def __init__(
-        self,
-        features: int,
-        dropout_rate: float,
-        rough_k: float
-    ):
+
+    def __init__(self, features: int, dropout_rate: float, rough_k: float):
         super().__init__()
         self.dropout = torch.nn.Dropout(dropout_rate)
         self.bilinear = torch.nn.Linear(features, features)
         self.k = rough_k
@@ -478,7 +448,7 @@ class RoughScorer(torch.nn.Module):
     def forward(
         self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
-        mentions: torch.Tensor
+        mentions: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Returns rough anaphoricity scores for candidates, which consist of
@@ -493,9 +463,7 @@ class RoughScorer(torch.nn.Module):

         return self._prune(rough_scores)

-    def _prune(self,
-               rough_scores: torch.Tensor
-               ) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _prune(self, rough_scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Selects top-k rough antecedent scores for each mention.
@@ -507,9 +475,9 @@ class RoughScorer(torch.nn.Module):
             FloatTensor of shape [n_mentions, k], top rough scores
             LongTensor of shape [n_mentions, k], top indices
         """
-        top_scores, indices = torch.topk(rough_scores,
-                                         k=min(self.k, len(rough_scores)),
-                                         dim=1, sorted=False)
+        top_scores, indices = torch.topk(
+            rough_scores, k=min(self.k, len(rough_scores)), dim=1, sorted=False
+        )

         return top_scores, indices
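`_prune` above keeps only the k best rough candidates per mention with `torch.topk`; only those survive to the fine-grained AnaphoricityScorer. A standalone example of the pruning step with toy scores:

    import torch

    # [n_mentions, n_candidates] rough pairwise scores
    rough_scores = torch.tensor([[0.1, 0.9, 0.3, 0.7],
                                 [0.6, 0.2, 0.8, 0.4]])
    top_scores, indices = torch.topk(rough_scores, k=2, dim=1, sorted=False)
    # Each row keeps its 2 best scores plus the candidate indices they came
    # from, e.g. indices[0] contains 1 and 3 for the first mention.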
@@ -523,7 +491,7 @@ class SpanPredictor(torch.nn.Module):
             torch.nn.Linear(input_size * 2 + dist_emb_size, hidden_size),
             torch.nn.ReLU(),
             torch.nn.Dropout(0.3),
-            #TODO seems weird the 256 isn't a parameter???
+            # TODO seems weird the 256 isn't a parameter???
             torch.nn.Linear(hidden_size, 256),
             torch.nn.ReLU(),
             torch.nn.Dropout(0.3),
@@ -531,15 +499,16 @@ class SpanPredictor(torch.nn.Module):
             torch.nn.Linear(256, dist_emb_size),
         )
         self.conv = torch.nn.Sequential(
-            torch.nn.Conv1d(64, 4, 3, 1, 1),
-            torch.nn.Conv1d(4, 2, 3, 1, 1)
+            torch.nn.Conv1d(64, 4, 3, 1, 1), torch.nn.Conv1d(4, 2, 3, 1, 1)
         )
-        self.emb = torch.nn.Embedding(128, dist_emb_size) # [-63, 63] + too_far
+        self.emb = torch.nn.Embedding(128, dist_emb_size)  # [-63, 63] + too_far

-    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
-                sent_id,
-                words: torch.Tensor,
-                heads_ids: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
+        sent_id,
+        words: torch.Tensor,
+        heads_ids: torch.Tensor,
+    ) -> torch.Tensor:
         """
         Calculates span start/end scores of words for each span head in
         heads_ids
@@ -557,37 +526,44 @@ class SpanPredictor(torch.nn.Module):
         if heads_ids.nelement() == 0:
             return torch.empty(size=(0,))
         # Obtain distance embedding indices, [n_heads, n_words]
-        relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0]).unsqueeze(0))
+        relative_positions = heads_ids.unsqueeze(1) - torch.arange(
+            words.shape[0]
+        ).unsqueeze(0)
         # make all valid distances positive
         emb_ids = relative_positions + 63
         # "too_far"
         emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127
         # Obtain "same sentence" boolean mask, [n_heads, n_words]
         heads_ids = heads_ids.long()
-        same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0))
+        same_sent = sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0)
         # To save memory, only pass candidates from one sentence for each head
         # pair_matrix contains concatenated span_head_emb + candidate_emb + distance_emb
         # for each candidate among the words in the same sentence as span_head
         # [n_heads, input_size * 2 + distance_emb_size]
         rows, cols = same_sent.nonzero(as_tuple=True)
-        pair_matrix = torch.cat((
-            words[heads_ids[rows]],
-            words[cols],
-            self.emb(emb_ids[rows, cols]),
-        ), dim=1)
+        pair_matrix = torch.cat(
+            (
+                words[heads_ids[rows]],
+                words[cols],
+                self.emb(emb_ids[rows, cols]),
+            ),
+            dim=1,
+        )
         lengths = same_sent.sum(dim=1)
         padding_mask = torch.arange(0, lengths.max().item()).unsqueeze(0)
-        padding_mask = (padding_mask < lengths.unsqueeze(1)) # [n_heads, max_sent_len]
+        padding_mask = padding_mask < lengths.unsqueeze(1)  # [n_heads, max_sent_len]
         # [n_heads, max_sent_len, input_size * 2 + distance_emb_size]
         # This is necessary to allow the convolution layer to look at several
         # word scores
         padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1])
         padded_pairs[padding_mask] = pair_matrix

-        res = self.ffnn(padded_pairs) # [n_heads, n_candidates, last_layer_output]
-        res = self.conv(res.permute(0, 2, 1)).permute(0, 2, 1) # [n_heads, n_candidates, 2]
+        res = self.ffnn(padded_pairs)  # [n_heads, n_candidates, last_layer_output]
+        res = self.conv(res.permute(0, 2, 1)).permute(
+            0, 2, 1
+        )  # [n_heads, n_candidates, 2]

-        scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf'))
+        scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float("-inf"))
         scores[rows, cols] = res[padding_mask]
         # Make sure that start <= head <= end during inference
         if not self.training:
             valid_starts = torch.log((relative_positions >= 0).to(torch.float))
             valid_ends = torch.log((relative_positions <= 0).to(torch.float))
             valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
             return scores + valid_positions
         return scores

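On the distance indices above: `relative_positions + 63` maps signed head-to-word offsets in [-63, 63] onto rows 0..126 of the 128-row embedding table, and anything out of range is bucketed into row 127 ("too_far"). A worked example under those assumptions:

    import torch

    heads_ids = torch.tensor([2])  # one head, at word index 2
    n_words = 6
    relative_positions = heads_ids.unsqueeze(1) - torch.arange(n_words).unsqueeze(0)
    # tensor([[ 2,  1,  0, -1, -2, -3]])
    emb_ids = relative_positions + 63  # tensor([[65, 64, 63, 62, 61, 60]])
    emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127  # nothing out of range here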
-class DistancePairwiseEncoder(torch.nn.Module):
+
+class DistancePairwiseEncoder(torch.nn.Module):
     def __init__(self, embedding_size, dropout_rate):
         super().__init__()
         emb_size = embedding_size
@@ -606,12 +582,12 @@ class DistancePairwiseEncoder(torch.nn.Module):
         self.dropout = torch.nn.Dropout(dropout_rate)
         self.shape = emb_size

-    def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
-                top_indices: torch.Tensor
-                ) -> torch.Tensor:
+    def forward(
+        self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
+        top_indices: torch.Tensor,
+    ) -> torch.Tensor:
         word_ids = torch.arange(0, top_indices.size(0))
-        distance = (word_ids.unsqueeze(1) - word_ids[top_indices]
-                    ).clamp_min_(min=1)
+        distance = (word_ids.unsqueeze(1) - word_ids[top_indices]).clamp_min_(min=1)
         log_distance = distance.to(torch.float).log2().floor_()
         log_distance = log_distance.clamp_max_(max=6).to(torch.long)
         distance = torch.where(distance < 5, distance - 1, log_distance + 2)
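The bucketing in the last hunk is self-contained: distances 1-4 keep their own buckets (d - 1), while longer distances share logarithmic buckets (floor(log2 d) + 2, capped at 6 + 2 = 8). A quick check of that mapping:

    import torch

    distance = torch.tensor([1, 2, 4, 5, 8, 16, 64, 1000]).clamp_min_(min=1)
    log_distance = distance.to(torch.float).log2().floor_()
    log_distance = log_distance.clamp_max_(max=6).to(torch.long)
    buckets = torch.where(distance < 5, distance - 1, log_distance + 2)
    # tensor([0, 1, 3, 4, 5, 6, 8, 8]) -- nine buckets (0-8) in total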