From db422abf011fb9b0dabde5e22b9d7fa0b05424b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 18 Mar 2022 16:24:26 +0100 Subject: [PATCH 01/18] remove unnecessary .device --- spacy/ml/models/coref.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index f40a4c110..fea4bc21a 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -38,14 +38,11 @@ def build_wl_coref_model( except ValueError: # happens with transformer listener dim = 768 - + with Model.define_operators({">>": chain}): # TODO chain tok2vec with these models - # TODO fix device - should be automatic - device = "cuda:0" coref_scorer = PyTorchWrapper( CorefScorer( - device, dim, embedding_size, hidden_size, @@ -65,7 +62,6 @@ def build_wl_coref_model( # TODO this was hardcoded to 1024, check hidden_size, sp_embedding_size, - device ), convert_inputs=convert_span_predictor_inputs @@ -205,7 +201,6 @@ class CorefScorer(torch.nn.Module): """ def __init__( self, - device: str, dim: int, # tok2vec size dist_emb_size: int, hidden_size: int, @@ -222,8 +217,7 @@ class CorefScorer(torch.nn.Module): epochs_trained (int): the number of epochs finished (useful for warm start) """ - # device, dist_emb_size, hidden_size, n_layers, dropout_rate - self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate).to(device) + self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate) #TODO clean this up bert_emb = dim pair_emb = bert_emb * 3 + self.pw.shape @@ -232,7 +226,7 @@ class CorefScorer(torch.nn.Module): hidden_size, n_layers, dropout_rate - ).to(device) + ) self.lstm = torch.nn.LSTM( input_size=bert_emb, hidden_size=bert_emb, @@ -243,7 +237,7 @@ class CorefScorer(torch.nn.Module): bert_emb, dropout_rate, roughk - ).to(device) + ) self.batch_size = batch_size def forward( @@ -392,7 +386,6 @@ class AnaphoricityScorer(torch.nn.Module): return out - class RoughScorer(torch.nn.Module): """ Is needed to give a roughly estimate of the anaphoricity of two candidates, @@ -423,7 +416,6 @@ class RoughScorer(torch.nn.Module): pair_mask = torch.arange(mentions.shape[0]) pair_mask = pair_mask.unsqueeze(1) - pair_mask.unsqueeze(0) pair_mask = torch.log((pair_mask > 0).to(torch.float)) - pair_mask = pair_mask.to(mentions.device) bilinear_scores = self.dropout(self.bilinear(mentions)).mm(mentions.T) rough_scores = pair_mask + bilinear_scores @@ -450,7 +442,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): - def __init__(self, input_size: int, distance_emb_size: int, device): + def __init__(self, input_size: int, distance_emb_size: int): super().__init__() self.ffnn = torch.nn.Sequential( torch.nn.Linear(input_size * 2 + 64, input_size), @@ -461,7 +453,6 @@ class SpanPredictor(torch.nn.Module): torch.nn.Dropout(0.3), torch.nn.Linear(256, 64), ) - self.device = device self.conv = torch.nn.Sequential( torch.nn.Conv1d(64, 4, 3, 1, 1), torch.nn.Conv1d(4, 2, 3, 1, 1) @@ -529,6 +520,8 @@ class SpanPredictor(torch.nn.Module): valid_positions = torch.stack((valid_starts, valid_ends), dim=2) return scores + valid_positions return scores + + class DistancePairwiseEncoder(torch.nn.Module): def __init__(self, embedding_size, dropout_rate): @@ -538,17 +531,10 @@ class DistancePairwiseEncoder(torch.nn.Module): self.dropout = torch.nn.Dropout(dropout_rate) self.shape = emb_size - @property - def device(self) -> torch.device: - """ A workaround to get current device (which is assumed to be the 
-        device of the first parameter of one of the submodules) """
-        return next(self.distance_emb.parameters()).device
-
-
     def forward(self,  # type: ignore  # pylint: disable=arguments-differ  #35566 in pytorch
                 top_indices: torch.Tensor
                 ) -> torch.Tensor:
-        word_ids = torch.arange(0, top_indices.size(0), device=self.device)
+        word_ids = torch.arange(0, top_indices.size(0))
         distance = (word_ids.unsqueeze(1) - word_ids[top_indices]
                     ).clamp_min_(min=1)
         log_distance = distance.to(torch.float).log2().floor_()

From 1eaf8fb0cf01dec6d6a01f20e109eb21fd5f530d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?=
Date: Wed, 23 Mar 2022 11:24:27 +0100
Subject: [PATCH 02/18] span predictor debug start

---
 spacy/ml/models/coref.py |  9 ++++-----
 spacy/pipeline/coref.py  | 14 +++++++-------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 382d7a98b..29f3ad819 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -91,7 +91,7 @@ def build_span_predictor(
     # TODO fix device - should be automatic
     device = "cuda:0"
     span_predictor = PyTorchWrapper(
-        SpanPredictor(hidden_size, dist_emb_size, device),
+        SpanPredictor(dim, dist_emb_size, device),
         convert_inputs=convert_span_predictor_inputs
     )
     # TODO use proper parameter for prefix
@@ -148,7 +148,6 @@ def convert_span_predictor_inputs(
    # Normally we shoudl use the input is_train, but for these two it's not relevant
    sent_ids = xp2torch(sent_ids[0], requires_grad=False)
    head_ids = xp2torch(head_ids[0], requires_grad=False)
-
    word_features = xp2torch(tok2vec[0], requires_grad=is_train)

    argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={})
@@ -557,7 +556,6 @@ class SpanPredictor(torch.nn.Module):
         sent_id = torch.tensor(sent_id, device=words.device)
         heads_ids = heads_ids.long()
         same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0))
-
         # To save memory, only pass candidates from one sentence for each head
         # pair_matrix contains concatenated span_head_emb + candidate_emb + distance_emb
         # for each candidate among the words in the same sentence as span_head
@@ -568,11 +566,11 @@ class SpanPredictor(torch.nn.Module):
             words[cols],
             self.emb(emb_ids[rows, cols]),
         ), dim=1)
-
+        input(len(heads_ids))
         lengths = same_sent.sum(dim=1)
         padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0)
         padding_mask = (padding_mask < lengths.unsqueeze(1))  # [n_heads, max_sent_len]
-
+        input(padding_mask.shape)
         # [n_heads, max_sent_len, input_size * 2 + distance_emb_size]
         # This is necessary to allow the convolution layer to look at several
         # word scores
@@ -592,6 +590,7 @@ class SpanPredictor(torch.nn.Module):
             valid_positions = torch.stack((valid_starts, valid_ends), dim=2)
             return scores + valid_positions
         return scores
+

 class DistancePairwiseEncoder(torch.nn.Module):
     def __init__(self, embedding_size, dropout_rate):
diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py
index 54e9d8cfd..b3ced454c 100644
--- a/spacy/pipeline/coref.py
+++ b/spacy/pipeline/coref.py
@@ -3,7 +3,7 @@ import warnings

 from thinc.types import Floats2d, Floats3d, Ints2d
 from thinc.api import Model, Config, Optimizer, CategoricalCrossentropy
-from thinc.api import set_dropout_rate
+from thinc.api import set_dropout_rate, to_categorical
 from itertools import islice
 from statistics import mean

@@ -513,10 +513,8 @@ class SpanPredictor(TrainablePipe):
         total_loss = 0

         for eg in examples:
-            preds, backprop = self.model.begin_update([eg.predicted])
-            score_matrix, mention_idx = preds
-
-            loss, d_scores = self.get_loss([eg], score_matrix, mention_idx)
+            span_scores, backprop = self.model.begin_update([eg.predicted])
+            loss, d_scores = self.get_loss([eg], span_scores)
             total_loss += loss
             # TODO check shape here
             backprop((d_scores, mention_idx))
@@ -573,8 +571,10 @@ class SpanPredictor(TrainablePipe):
         for cluster in gold:
             for mention in cluster:
                 starts.append(mention[0])
-                ends.append(mention[1])
-
+                # XXX I think this was missing here
+                ends.append(mention[1] - 1)
+        starts = self.model.ops.xp.asarray(starts)
+        ends = self.model.ops.xp.asarray(ends)
         start_scores = span_scores[:, :, 0]
         end_scores = span_scores[:, :, 1]
         n_classes = start_scores.shape[1]

From 706b2e6f25cc98e4be47adf5c0b8b968158019cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?=
Date: Thu, 24 Mar 2022 16:06:20 +0100
Subject: [PATCH 03/18] gearing up SpanPredictor for gold-heads

---
 spacy/ml/models/coref.py | 34 ++++++++++++++++++++--------------
 spacy/pipeline/coref.py  | 20 +++++++++++++++-----
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py
index 179de7e58..3350a8dd9 100644
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@@ -53,7 +53,6 @@ def build_wl_coref_model(
         convert_inputs=convert_coref_scorer_inputs,
         convert_outputs=convert_coref_scorer_outputs
     )
-
     coref_model = tok2vec >> coref_scorer
     # XXX just ignore this until the coref scorer is integrated
     span_predictor = PyTorchWrapper(
@@ -62,7 +61,6 @@ def build_wl_coref_model(
             hidden_size,
             sp_embedding_size,
         ),
-
        convert_inputs=convert_span_predictor_inputs
    )
    # TODO combine models so output is uniform (just one forward pass)
@@ -84,14 +82,15 @@ def build_span_predictor(
         dim = 768

     with Model.define_operators({">>": chain, "&": tuplify}):
-        # TODO fix device - should be automatic
-        device = "cuda:0"
         span_predictor = PyTorchWrapper(
-            SpanPredictor(dim, dist_emb_size, device),
+            SpanPredictor(dim, dist_emb_size),
             convert_inputs=convert_span_predictor_inputs
         )
         # TODO use proper parameter for prefix
-        head_info = build_get_head_metadata("coref_head_clusters")
+        head_info = build_get_head_metadata(
+            "span_coref_head_clusters",
+            "coref_head_clusters"
+        )
         model = (tok2vec & head_info) >> span_predictor

     return model
@@ -148,7 +147,7 @@ def convert_span_predictor_inputs(
     argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={})
     # TODO actually support backprop
-    return argskwargs, lambda dX: []
+    return argskwargs, lambda dX: [[]]

 # TODO This probably belongs in the component, not the model.
 def predict_span_clusters(span_predictor: Model,
@@ -217,18 +216,27 @@ def _clusterize(
         clusters.append(sorted(cluster))
     return sorted(clusters)

-def build_get_head_metadata(prefix):
+
+def build_get_head_metadata(update_prefix, predict_prefix):
     # TODO this name is awful, fix it
-    model = Model("HeadDataProvider", attrs={"prefix": prefix}, forward=head_data_forward)
+    model = Model("HeadDataProvider",
+                  attrs={
+                      "update_prefix": update_prefix,
+                      "predict_prefix": predict_prefix
+                  },
+                  forward=head_data_forward)
     return model

+
 def head_data_forward(model, docs, is_train):
     """A layer to generate the extra data needed for the span predictor.
""" sent_ids = [] head_ids = [] - prefix = model.attrs["prefix"] - + if is_train: + prefix = model.attrs["update_prefix"] + else: + prefix = model.attrs["predict_prefix"] for doc in docs: sids = model.ops.asarray2i(get_sentence_ids(doc)) sent_ids.append(sids) @@ -241,7 +249,7 @@ def head_data_forward(model, docs, is_train): heads.append(span[0].i) heads = model.ops.asarray2i(heads) head_ids.append(heads) - + # each of these is a list with one entry per doc # backprop is just a placeholder # TODO it would probably be better to have a list of tuples than two lists of arrays @@ -557,11 +565,9 @@ class SpanPredictor(torch.nn.Module): words[cols], self.emb(emb_ids[rows, cols]), ), dim=1) - input(len(heads_ids)) lengths = same_sent.sum(dim=1) padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0) padding_mask = (padding_mask < lengths.unsqueeze(1)) # [n_heads, max_sent_len] - input(padding_mask.shape) # [n_heads, max_sent_len, input_size * 2 + distance_emb_size] # This is necessary to allow the convolution layer to look at several # word scores diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index b3ced454c..f37f777fc 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -417,6 +417,7 @@ DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)[ default_config={ "model": DEFAULT_SPAN_PREDICTOR_MODEL, "input_prefix": "coref_head_clusters", + "target_prefix": "span_head_target_clusters", "output_prefix": "coref_clusters", }, default_score_weights={"span_predictor_f": 1.0, "span_predictor_p": None, "span_predictor_r": None}, @@ -426,6 +427,7 @@ def make_span_predictor( name: str, model, input_prefix: str = "coref_head_clusters", + target_prefix: str = "span_head_target_clusters", output_prefix: str = "coref_clusters", ) -> "SpanPredictor": """Create a SpanPredictor component.""" @@ -444,12 +446,14 @@ class SpanPredictor(TrainablePipe): name: str = "span_predictor", *, input_prefix: str = "coref_head_clusters", + target_prefix: str = "span_coref_head_clusters", output_prefix: str = "coref_clusters", ) -> None: self.vocab = vocab self.model = model self.name = name self.input_prefix = input_prefix + self.target_prefix = target_prefix self.output_prefix = output_prefix self.cfg = {} @@ -511,13 +515,18 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - - for eg in examples: - span_scores, backprop = self.model.begin_update([eg.predicted]) + docs = [eg.predicted for eg in examples] + for doc, eg in zip(docs, examples): + # replicates the EntityLinker's behaviour and + # copies annotations over https://bit.ly/3iweDcW + for key, sg in eg.reference.spans.items(): + if key.startswith(self.target_prefix): + doc.spans[key] = [doc[span.start:span.end] for span in sg] + span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here - backprop((d_scores, mention_idx)) + backprop(d_scores) if sgd is not None: self.finish_update(sgd) @@ -564,7 +573,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # get gold data - gold = doc2clusters(eg.reference, self.output_prefix) + gold = doc2clusters(eg.predicted, self.target_prefix) # flatten the gold data starts = [] ends = [] @@ -605,6 +614,7 @@ class SpanPredictor(TrainablePipe): doc = ex.predicted assert len(doc) > 2, "Coreference requires at least two tokens" doc.spans[f"{self.input_prefix}_0"] = [doc[0:1], doc[1:2]] + 
doc.spans[f"{self.target_prefix}_0"] = [doc[0:1], doc[1:2]] X.append(ex.predicted) Y.append(ex.reference) From 1c5dabcb47f89635a9a5c529f48abaac694fcf4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 24 Mar 2022 16:23:12 +0100 Subject: [PATCH 04/18] merge SpanPredictor attributes --- spacy/ml/models/coref.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 5fbb64a29..5fe29c25f 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -83,7 +83,7 @@ def build_span_predictor( with Model.define_operators({">>": chain, "&": tuplify}): span_predictor = PyTorchWrapper( - SpanPredictor(dim, dist_emb_size), + SpanPredictor(dim, hidden_size, dist_emb_size), convert_inputs=convert_span_predictor_inputs ) # TODO use proper parameter for prefix @@ -511,11 +511,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): -<<<<<<< HEAD - def __init__(self, input_size: int, distance_emb_size: int): -======= def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int, device): ->>>>>>> eec00ce60d83f500e18f2da7d9feafa7143440f2 super().__init__() # input size = single token size # 64 = probably distance emb size From 83ac0477c8e73b3676a8614368f430d3e9ae6fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 24 Mar 2022 16:44:50 +0100 Subject: [PATCH 05/18] remove useless extra prefix and device from spanpredictor --- spacy/ml/models/coref.py | 41 +++++++++++++++++----------------------- spacy/pipeline/coref.py | 10 +++------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 5fe29c25f..71082e7ac 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -55,14 +55,14 @@ def build_wl_coref_model( ) coref_model = tok2vec >> coref_scorer # XXX just ignore this until the coref scorer is integrated - span_predictor = PyTorchWrapper( - SpanPredictor( - # TODO this was hardcoded to 1024, check - hidden_size, - sp_embedding_size, - ), - convert_inputs=convert_span_predictor_inputs - ) + # span_predictor = PyTorchWrapper( + # SpanPredictor( + # TODO this was hardcoded to 1024, check + # hidden_size, + # sp_embedding_size, + # ), + # convert_inputs=convert_span_predictor_inputs + # ) # TODO combine models so output is uniform (just one forward pass) # It may be reasonable to have an option to disable span prediction, # and just return words as spans. 
@@ -88,7 +88,6 @@ def build_span_predictor( ) # TODO use proper parameter for prefix head_info = build_get_head_metadata( - "span_coref_head_clusters", "coref_head_clusters" ) model = (tok2vec & head_info) >> span_predictor @@ -217,13 +216,10 @@ def _clusterize( return sorted(clusters) -def build_get_head_metadata(update_prefix, predict_prefix): +def build_get_head_metadata(prefix): # TODO this name is awful, fix it model = Model("HeadDataProvider", - attrs={ - "update_prefix": update_prefix, - "predict_prefix": predict_prefix - }, + attrs={'prefix': prefix}, forward=head_data_forward) return model @@ -233,10 +229,7 @@ def head_data_forward(model, docs, is_train): """ sent_ids = [] head_ids = [] - if is_train: - prefix = model.attrs["update_prefix"] - else: - prefix = model.attrs["predict_prefix"] + prefix = model.attrs["prefix"] for doc in docs: sids = model.ops.asarray2i(get_sentence_ids(doc)) sent_ids.append(sids) @@ -511,7 +504,7 @@ class RoughScorer(torch.nn.Module): class SpanPredictor(torch.nn.Module): - def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int, device): + def __init__(self, input_size: int, hidden_size: int, dist_emb_size: int): super().__init__() # input size = single token size # 64 = probably distance emb size @@ -551,13 +544,13 @@ class SpanPredictor(torch.nn.Module): torch.Tensor: span start/end scores, [n_heads, n_words, 2] """ # Obtain distance embedding indices, [n_heads, n_words] - relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0], device=words.device).unsqueeze(0)) + relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0]).unsqueeze(0)) # make all valid distances positive emb_ids = relative_positions + 63 # "too_far" emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127 # Obtain "same sentence" boolean mask, [n_heads, n_words] - sent_id = torch.tensor(sent_id, device=words.device) + sent_id = torch.tensor(sent_id) heads_ids = heads_ids.long() same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0)) # To save memory, only pass candidates from one sentence for each head @@ -571,18 +564,18 @@ class SpanPredictor(torch.nn.Module): self.emb(emb_ids[rows, cols]), ), dim=1) lengths = same_sent.sum(dim=1) - padding_mask = torch.arange(0, lengths.max().item(), device=words.device).unsqueeze(0) + padding_mask = torch.arange(0, lengths.max().item()).unsqueeze(0) padding_mask = (padding_mask < lengths.unsqueeze(1)) # [n_heads, max_sent_len] # [n_heads, max_sent_len, input_size * 2 + distance_emb_size] # This is necessary to allow the convolution layer to look at several # word scores - padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1], device=words.device) + padded_pairs = torch.zeros(*padding_mask.shape, pair_matrix.shape[-1]) padded_pairs[padding_mask] = pair_matrix res = self.ffnn(padded_pairs) # [n_heads, n_candidates, last_layer_output] res = self.conv(res.permute(0, 2, 1)).permute(0, 2, 1) # [n_heads, n_candidates, 2] - scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf'), device=words.device) + scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf')) scores[rows, cols] = res[padding_mask] # Make sure that start <= head <= end during inference diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index f37f777fc..eb05011ec 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -417,7 +417,6 @@ DEFAULT_SPAN_PREDICTOR_MODEL = Config().from_str(default_span_predictor_config)[ default_config={ "model": 
DEFAULT_SPAN_PREDICTOR_MODEL, "input_prefix": "coref_head_clusters", - "target_prefix": "span_head_target_clusters", "output_prefix": "coref_clusters", }, default_score_weights={"span_predictor_f": 1.0, "span_predictor_p": None, "span_predictor_r": None}, @@ -427,7 +426,6 @@ def make_span_predictor( name: str, model, input_prefix: str = "coref_head_clusters", - target_prefix: str = "span_head_target_clusters", output_prefix: str = "coref_clusters", ) -> "SpanPredictor": """Create a SpanPredictor component.""" @@ -446,14 +444,12 @@ class SpanPredictor(TrainablePipe): name: str = "span_predictor", *, input_prefix: str = "coref_head_clusters", - target_prefix: str = "span_coref_head_clusters", output_prefix: str = "coref_clusters", ) -> None: self.vocab = vocab self.model = model self.name = name self.input_prefix = input_prefix - self.target_prefix = target_prefix self.output_prefix = output_prefix self.cfg = {} @@ -519,8 +515,9 @@ class SpanPredictor(TrainablePipe): for doc, eg in zip(docs, examples): # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW + # takes 'coref_head_clusters' from the reference. for key, sg in eg.reference.spans.items(): - if key.startswith(self.target_prefix): + if key.startswith(self.input_prefix): doc.spans[key] = [doc[span.start:span.end] for span in sg] span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) @@ -573,7 +570,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # get gold data - gold = doc2clusters(eg.predicted, self.target_prefix) + gold = doc2clusters(eg.predicted, self.input_prefix) # flatten the gold data starts = [] ends = [] @@ -614,7 +611,6 @@ class SpanPredictor(TrainablePipe): doc = ex.predicted assert len(doc) > 2, "Coreference requires at least two tokens" doc.spans[f"{self.input_prefix}_0"] = [doc[0:1], doc[1:2]] - doc.spans[f"{self.target_prefix}_0"] = [doc[0:1], doc[1:2]] X.append(ex.predicted) Y.append(ex.reference) From 7304604edd6238d16f156b3f30db40d809f1a440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 25 Mar 2022 18:29:33 +0100 Subject: [PATCH 06/18] make sure predicted and reference keeps aligned --- spacy/pipeline/coref.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index eb05011ec..99bb611ff 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -130,7 +130,6 @@ class CoreferenceResolver(TrainablePipe): DOCS: https://spacy.io/api/coref#predict (TODO) """ - #print("DOCS", docs) out = [] for doc in docs: scores, idxs = self.model.predict([doc]) @@ -212,7 +211,6 @@ class CoreferenceResolver(TrainablePipe): # TODO check this causes no issues (in practice it runs) preds, backprop = self.model.begin_update([eg.predicted]) score_matrix, mention_idx = preds - loss, d_scores = self.get_loss([eg], score_matrix, mention_idx) total_loss += loss # TODO check shape here @@ -518,7 +516,8 @@ class SpanPredictor(TrainablePipe): # takes 'coref_head_clusters' from the reference. 
for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): - doc.spans[key] = [doc[span.start:span.end] for span in sg] + aligned_spans = eg.get_aligned_spans_x2y(sg) + doc.spans[key] = [doc[span.start:span.end] for span in aligned_spans] span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss @@ -600,7 +599,7 @@ class SpanPredictor(TrainablePipe): *, nlp: Optional[Language] = None, ) -> None: - validate_get_examples(get_examples, "CoreferenceResolver.initialize") + validate_get_examples(get_examples, "SpanPredictor.initialize") X = [] Y = [] From 4fc40340f94d6dc47398dfa264804723b7e52b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 11:28:21 +0200 Subject: [PATCH 07/18] handle empty head_ids --- spacy/ml/models/coref.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 71082e7ac..7972f9160 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -133,6 +133,7 @@ def convert_coref_scorer_outputs( indices_xp = torch2xp(indices) return (scores_xp, indices_xp), convert_for_torch_backward + def convert_span_predictor_inputs( model: Model, X: Tuple[Ints1d, Floats2d, Ints1d], @@ -141,13 +142,17 @@ def convert_span_predictor_inputs( tok2vec, (sent_ids, head_ids) = X # Normally we shoudl use the input is_train, but for these two it's not relevant sent_ids = xp2torch(sent_ids[0], requires_grad=False) - head_ids = xp2torch(head_ids[0], requires_grad=False) + if not head_ids[0].size: + head_ids = torch.empty(size=(0,)) + else: + head_ids = xp2torch(head_ids[0], requires_grad=False) word_features = xp2torch(tok2vec[0], requires_grad=is_train) argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={}) # TODO actually support backprop return argskwargs, lambda dX: [[]] + # TODO This probably belongs in the component, not the model. 
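
The placeholder backprop above (`lambda dX: [[]]`) stands in for the gradient-mapping callback that thinc's PyTorchWrapper expects from `convert_inputs`: the hook must package the thinc arrays into an ArgsKwargs of torch tensors and return a callback that maps the torch gradients back into thinc arrays (a real callback lands in PATCH 15 below). A rough sketch of that contract on a single-array input, with a hypothetical wrapped module:

    import torch
    from thinc.api import PyTorchWrapper, ArgsKwargs, xp2torch, torch2xp

    def convert_inputs(model, X, is_train):
        X_torch = xp2torch(X, requires_grad=is_train)

        def backprop(d_inputs: ArgsKwargs):
            # torch gradients for our args come back inside an ArgsKwargs
            return torch2xp(d_inputs.args[0])

        return ArgsKwargs(args=(X_torch,), kwargs={}), backprop

    wrapped = PyTorchWrapper(torch.nn.Linear(4, 2), convert_inputs=convert_inputs)
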
def predict_span_clusters(span_predictor: Model, sent_ids: Ints1d, @@ -543,6 +548,9 @@ class SpanPredictor(torch.nn.Module): Returns: torch.Tensor: span start/end scores, [n_heads, n_words, 2] """ + # If we don't receive heads, return empty + if heads_ids.nelement() == 0: + return torch.empty(size=(0,)) # Obtain distance embedding indices, [n_heads, n_words] relative_positions = (heads_ids.unsqueeze(1) - torch.arange(words.shape[0]).unsqueeze(0)) # make all valid distances positive @@ -550,7 +558,6 @@ class SpanPredictor(torch.nn.Module): # "too_far" emb_ids[(emb_ids < 0) + (emb_ids > 126)] = 127 # Obtain "same sentence" boolean mask, [n_heads, n_words] - sent_id = torch.tensor(sent_id) heads_ids = heads_ids.long() same_sent = (sent_id[heads_ids].unsqueeze(1) == sent_id.unsqueeze(0)) # To save memory, only pass candidates from one sentence for each head From e4b4b67ef6f627f7cd9cd313ab9274779c16c971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 11:29:00 +0200 Subject: [PATCH 08/18] handle empty clusters --- spacy/pipeline/coref.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 99bb611ff..5a4fa1ab9 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -458,27 +458,29 @@ class SpanPredictor(TrainablePipe): out = [] for doc in docs: # TODO check shape here - span_scores = self.model.predict(doc) - span_scores = span_scores[0] - # the information about clustering has to come from the input docs - # first let's convert the scores to a list of span idxs - start_scores = span_scores[:, :, 0] - end_scores = span_scores[:, :, 1] - starts = start_scores.argmax(axis=1) - ends = end_scores.argmax(axis=1) + span_scores = self.model.predict([doc]) + if span_scores.size: + # the information about clustering has to come from the input docs + # first let's convert the scores to a list of span idxs + start_scores = span_scores[:, :, 0] + end_scores = span_scores[:, :, 1] + starts = start_scores.argmax(axis=1) + ends = end_scores.argmax(axis=1) - # TODO check start < end + # TODO check start < end - # get the old clusters (shape will be preserved) - clusters = doc2clusters(doc, self.input_prefix) - cidx = 0 - out_clusters = [] - for cluster in clusters: - ncluster = [] - for mention in cluster: - ncluster.append( (starts[cidx], ends[cidx]) ) - cidx += 1 - out_clusters.append(ncluster) + # get the old clusters (shape will be preserved) + clusters = doc2clusters(doc, self.input_prefix) + cidx = 0 + out_clusters = [] + for cluster in clusters: + ncluster = [] + for mention in cluster: + ncluster.append((starts[cidx], ends[cidx])) + cidx += 1 + out_clusters.append(ncluster) + else: + out_clusters = [] out.append(out_clusters) return out @@ -628,7 +630,6 @@ class SpanPredictor(TrainablePipe): # XXX this is the only different part p_clusters = doc2clusters(ex.predicted, self.output_prefix) g_clusters = doc2clusters(ex.reference, self.output_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) evaluator.update(cluster_info) From 06d680b269c87059ca1fd0381f025a2bcc60c5ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 14:31:51 +0200 Subject: [PATCH 09/18] addressing suggestions by @polm --- spacy/pipeline/coref.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 5a4fa1ab9..340dde470 100644 --- 
a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -511,20 +511,24 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - docs = [eg.predicted for eg in examples] - for doc, eg in zip(docs, examples): + old_spans = [eg.predicted.spans for eg in examples] + for eg in examples: # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW - # takes 'coref_head_clusters' from the reference. + # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 + doc = eg.predicted for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): - aligned_spans = eg.get_aligned_spans_x2y(sg) - doc.spans[key] = [doc[span.start:span.end] for span in aligned_spans] + doc.spans[key] = eg.get_aligned_spans_y2x(sg) span_scores, backprop = self.model.begin_update([doc]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here backprop(d_scores) + # Restore examples + for spans, eg in zip(old_spans, examples): + for key, sg in spans.items(): + eg.predicted.spans[key] = sg if sgd is not None: self.finish_update(sgd) From 7ff99a3acc38cf7202fc269f32774d3e1f613d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 28 Mar 2022 18:16:41 +0200 Subject: [PATCH 10/18] nicer restore --- spacy/pipeline/coref.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 340dde470..f0862c844 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -364,9 +364,7 @@ class CoreferenceResolver(TrainablePipe): for ex in examples: p_clusters = doc2clusters(ex.predicted, self.span_cluster_prefix) g_clusters = doc2clusters(ex.reference, self.span_cluster_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) - evaluator.update(cluster_info) score = { @@ -511,12 +509,12 @@ class SpanPredictor(TrainablePipe): set_dropout_rate(self.model, drop) total_loss = 0 - old_spans = [eg.predicted.spans for eg in examples] for eg in examples: # replicates the EntityLinker's behaviour and # copies annotations over https://bit.ly/3iweDcW # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 doc = eg.predicted + old_spans = eg.predicted.spans for key, sg in eg.reference.spans.items(): if key.startswith(self.input_prefix): doc.spans[key] = eg.get_aligned_spans_y2x(sg) @@ -525,9 +523,8 @@ class SpanPredictor(TrainablePipe): total_loss += loss # TODO check shape here backprop(d_scores) - # Restore examples - for spans, eg in zip(old_spans, examples): - for key, sg in spans.items(): + # Restore example + for key, sg in old_spans.items(): eg.predicted.spans[key] = sg if sgd is not None: From 63a41ba50abd16c8b945bb39d8beff2879031cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Wed, 30 Mar 2022 17:28:20 +0200 Subject: [PATCH 11/18] fix score overwriting bug --- spacy/pipeline/coref.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index f0862c844..25a353405 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -636,14 +636,14 @@ class SpanPredictor(TrainablePipe): evaluator.update(cluster_info) score = { - "coref_f": evaluator.get_f1(), - "coref_p": evaluator.get_precision(), - "coref_r": evaluator.get_recall(), + "coref_span_f": evaluator.get_f1(), + "coref_span_p": evaluator.get_precision(), + "coref_span_r": 
evaluator.get_recall(), } scores.append(score) out = {} for field in ("f", "p", "r"): - fname = f"coref_{field}" + fname = f"coref_span_{field}" out[fname] = mean([ss[fname] for ss in scores]) return out From a1d021990379203a523c4c8683ce1bff620650f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 4 Apr 2022 15:26:15 +0200 Subject: [PATCH 12/18] prepare for aligned heads-spans training --- spacy/pipeline/coref.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 25a353405..1c0e56521 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -503,29 +503,20 @@ class SpanPredictor(TrainablePipe): losses = {} losses.setdefault(self.name, 0.0) validate_examples(examples, "SpanPredictor.update") - if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): + if not any(len(eg.reference) if eg.reference else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return losses set_dropout_rate(self.model, drop) total_loss = 0 for eg in examples: - # replicates the EntityLinker's behaviour and - # copies annotations over https://bit.ly/3iweDcW - # https://github.com/explosion/spaCy/blob/master/spacy/pipeline/entity_linker.py#L313 - doc = eg.predicted - old_spans = eg.predicted.spans - for key, sg in eg.reference.spans.items(): - if key.startswith(self.input_prefix): - doc.spans[key] = eg.get_aligned_spans_y2x(sg) - span_scores, backprop = self.model.begin_update([doc]) + # For update we use the gold coref_head_clusters + # in the reference. + span_scores, backprop = self.model.begin_update([eg.reference]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here backprop(d_scores) - # Restore example - for key, sg in old_spans.items(): - eg.predicted.spans[key] = sg if sgd is not None: self.finish_update(sgd) @@ -570,17 +561,14 @@ class SpanPredictor(TrainablePipe): # span_scores is a Floats3d. What are the axes? 
mention x token x start/end for eg in examples: - - # get gold data - gold = doc2clusters(eg.predicted, self.input_prefix) - # flatten the gold data starts = [] ends = [] - for cluster in gold: - for mention in cluster: - starts.append(mention[0]) - # XXX I think this was missing here - ends.append(mention[1] - 1) + for key, sg in eg.reference.spans.items(): + if key.startswith(self.output_prefix): + for mention in sg: + starts.append(mention.start) + ends.append(mention.end) + starts = self.model.ops.xp.asarray(starts) ends = self.model.ops.xp.asarray(ends) start_scores = span_scores[:, :, 0] From ef141ad3995410d64cd27a615b3f17ee21d59dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 4 Apr 2022 18:10:09 +0200 Subject: [PATCH 13/18] span accuracy score --- spacy/pipeline/coref.py | 52 +++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 1c0e56521..c1db23d68 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -457,6 +457,7 @@ class SpanPredictor(TrainablePipe): for doc in docs: # TODO check shape here span_scores = self.model.predict([doc]) + print(span_scores) if span_scores.size: # the information about clustering has to come from the input docs # first let's convert the scores to a list of span idxs @@ -608,30 +609,35 @@ class SpanPredictor(TrainablePipe): self.model.initialize(X=X, Y=Y) def score(self, examples, **kwargs): - """Score a batch of examples.""" - # TODO This is basically the same as the main coref component - factor out? - + """ + Evaluate on reconstructing the correct spans around + gold heads. + """ scores = [] - for metric in (b_cubed, muc, ceafe): - evaluator = Evaluator(metric) + for eg in examples: + starts = [] + ends = [] + pred_starts = [] + pred_ends = [] + ref = eg.reference + pred = eg.predicted + for key, gold_sg in ref.spans.items(): + if key.startswith(self.input_prefix): + cluster_id = key.split('_')[-1] + # FIXME THIS DOESN'T WORK BECAUSE pred.spans are empty? 
+ pred_sg = pred.spans[f"{self.output_prefix}_{cluster_id}"] + for gold_mention, pred_mention in zip(gold_sg, pred_sg): + starts.append(gold_mention.start) + ends.append(gold_mention.end) + pred_starts.append(pred_mention.start) + pred_ends.append(pred_mention.end) - for ex in examples: - # XXX this is the only different part - p_clusters = doc2clusters(ex.predicted, self.output_prefix) - g_clusters = doc2clusters(ex.reference, self.output_prefix) - cluster_info = get_cluster_info(p_clusters, g_clusters) + starts = self.model.ops.xp.asarray(starts) + ends = self.model.ops.xp.asarray(ends) + pred_starts = self.model.ops.xp.asarray(pred_starts) + pred_ends = self.model.ops.xp.asarray(pred_ends) + correct = ((starts == pred_starts) * (ends == pred_ends)).sum() + scores.append(correct) - evaluator.update(cluster_info) - - score = { - "coref_span_f": evaluator.get_f1(), - "coref_span_p": evaluator.get_precision(), - "coref_span_r": evaluator.get_recall(), - } - scores.append(score) - - out = {} - for field in ("f", "p", "r"): - fname = f"coref_span_{field}" - out[fname] = mean([ss[fname] for ss in scores]) + out = {"span_accuracy": mean(scores)} return out From 3ba913109d27827639eaa2bf91c1693bed7f33f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Thu, 7 Apr 2022 13:20:12 +0200 Subject: [PATCH 14/18] update with eg.predited as other components --- spacy/pipeline/coref.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index c1db23d68..1b062ed9a 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -457,7 +457,6 @@ class SpanPredictor(TrainablePipe): for doc in docs: # TODO check shape here span_scores = self.model.predict([doc]) - print(span_scores) if span_scores.size: # the information about clustering has to come from the input docs # first let's convert the scores to a list of span idxs @@ -513,7 +512,7 @@ class SpanPredictor(TrainablePipe): for eg in examples: # For update we use the gold coref_head_clusters # in the reference. - span_scores, backprop = self.model.begin_update([eg.reference]) + span_scores, backprop = self.model.begin_update([eg.predicted]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here @@ -622,10 +621,9 @@ class SpanPredictor(TrainablePipe): ref = eg.reference pred = eg.predicted for key, gold_sg in ref.spans.items(): - if key.startswith(self.input_prefix): + if key.startswith(self.output_prefix): cluster_id = key.split('_')[-1] - # FIXME THIS DOESN'T WORK BECAUSE pred.spans are empty? 
- pred_sg = pred.spans[f"{self.output_prefix}_{cluster_id}"] + pred_sg = pred.spans[key] for gold_mention, pred_mention in zip(gold_sg, pred_sg): starts.append(gold_mention.start) ends.append(gold_mention.end) From 2a1ad4c5d294de02af668e07d19894491afc3204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 8 Apr 2022 14:56:44 +0200 Subject: [PATCH 15/18] add backprop callback to spanpredictor --- spacy/ml/models/coref.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 7972f9160..0b533daf0 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -104,13 +104,13 @@ def convert_coref_scorer_inputs( # just use the first # TODO real batching X = X[0] - - word_features = xp2torch(X, requires_grad=is_train) + def backprop(args: ArgsKwargs) -> List[Floats2d]: # convert to xp and wrap in list gradients = torch2xp(args.args[0]) return [gradients] + return ArgsKwargs(args=(word_features, ), kwargs={}), backprop @@ -141,16 +141,22 @@ def convert_span_predictor_inputs( ): tok2vec, (sent_ids, head_ids) = X # Normally we shoudl use the input is_train, but for these two it's not relevant + + def backprop(args: ArgsKwargs) -> List[Floats2d]: + # convert to xp and wrap in list + gradients = torch2xp(args.args[1]) + return [[gradients], None] + + word_features = xp2torch(tok2vec[0], requires_grad=is_train) sent_ids = xp2torch(sent_ids[0], requires_grad=False) if not head_ids[0].size: head_ids = torch.empty(size=(0,)) else: head_ids = xp2torch(head_ids[0], requires_grad=False) - word_features = xp2torch(tok2vec[0], requires_grad=is_train) argskwargs = ArgsKwargs(args=(sent_ids, word_features, head_ids), kwargs={}) # TODO actually support backprop - return argskwargs, lambda dX: [[]] + return argskwargs, backprop # TODO This probably belongs in the component, not the model. @@ -247,7 +253,6 @@ def head_data_forward(model, docs, is_train): heads.append(span[0].i) heads = model.ops.asarray2i(heads) head_ids.append(heads) - # each of these is a list with one entry per doc # backprop is just a placeholder # TODO it would probably be better to have a list of tuples than two lists of arrays @@ -584,7 +589,6 @@ class SpanPredictor(torch.nn.Module): scores = torch.full((heads_ids.shape[0], words.shape[0], 2), float('-inf')) scores[rows, cols] = res[padding_mask] - # Make sure that start <= head <= end during inference if not self.training: valid_starts = torch.log((relative_positions >= 0).to(torch.float)) From 7a239f2ec7c71a494f2380686fdbcfdd421e7fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Fri, 8 Apr 2022 14:57:19 +0200 Subject: [PATCH 16/18] report start- and end-accuracies separately --- spacy/pipeline/coref.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 1b062ed9a..02c93f712 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -510,13 +510,11 @@ class SpanPredictor(TrainablePipe): total_loss = 0 for eg in examples: - # For update we use the gold coref_head_clusters - # in the reference. 
span_scores, backprop = self.model.begin_update([eg.predicted]) loss, d_scores = self.get_loss([eg], span_scores) total_loss += loss # TODO check shape here - backprop(d_scores) + backprop((d_scores)) if sgd is not None: self.finish_update(sgd) @@ -612,7 +610,8 @@ class SpanPredictor(TrainablePipe): Evaluate on reconstructing the correct spans around gold heads. """ - scores = [] + start_scores = [] + end_scores = [] for eg in examples: starts = [] ends = [] @@ -622,7 +621,6 @@ class SpanPredictor(TrainablePipe): pred = eg.predicted for key, gold_sg in ref.spans.items(): if key.startswith(self.output_prefix): - cluster_id = key.split('_')[-1] pred_sg = pred.spans[key] for gold_mention, pred_mention in zip(gold_sg, pred_sg): starts.append(gold_mention.start) @@ -634,8 +632,12 @@ class SpanPredictor(TrainablePipe): ends = self.model.ops.xp.asarray(ends) pred_starts = self.model.ops.xp.asarray(pred_starts) pred_ends = self.model.ops.xp.asarray(pred_ends) - correct = ((starts == pred_starts) * (ends == pred_ends)).sum() - scores.append(correct) - - out = {"span_accuracy": mean(scores)} + start_accuracy = (starts == pred_starts).mean() + end_accuracy = (ends == pred_ends).mean() + start_scores.append(float(start_accuracy)) + end_scores.append(float(end_accuracy)) + out = { + "span_start_accuracy": mean(start_scores), + "span_end_accuracy": mean(end_scores) + } return out From 6aedd98d02b55672469556f4d61f2ad6254f3759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A1d=C3=A1r=20=C3=81kos?= Date: Mon, 11 Apr 2022 16:10:14 +0200 Subject: [PATCH 17/18] fixing scorer --- spacy/pipeline/coref.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/spacy/pipeline/coref.py b/spacy/pipeline/coref.py index 02c93f712..fc04d1a3e 100644 --- a/spacy/pipeline/coref.py +++ b/spacy/pipeline/coref.py @@ -511,10 +511,13 @@ class SpanPredictor(TrainablePipe): total_loss = 0 for eg in examples: span_scores, backprop = self.model.begin_update([eg.predicted]) - loss, d_scores = self.get_loss([eg], span_scores) - total_loss += loss - # TODO check shape here - backprop((d_scores)) + # FIXME, this only happens once in the first 1000 docs of OntoNotes + # and I'm not sure yet why. + if span_scores.size: + loss, d_scores = self.get_loss([eg], span_scores) + total_loss += loss + # TODO check shape here + backprop((d_scores)) if sgd is not None: self.finish_update(sgd) @@ -557,7 +560,6 @@ class SpanPredictor(TrainablePipe): assert len(examples) == 1, "Only fake batching is supported." # starts and ends are gold starts and ends (Ints1d) # span_scores is a Floats3d. What are the axes? mention x token x start/end - for eg in examples: starts = [] ends = [] @@ -610,8 +612,8 @@ class SpanPredictor(TrainablePipe): Evaluate on reconstructing the correct spans around gold heads. 
""" - start_scores = [] - end_scores = [] + scores = [] + xp = self.model.ops.xp for eg in examples: starts = [] ends = [] @@ -628,16 +630,11 @@ class SpanPredictor(TrainablePipe): pred_starts.append(pred_mention.start) pred_ends.append(pred_mention.end) - starts = self.model.ops.xp.asarray(starts) - ends = self.model.ops.xp.asarray(ends) - pred_starts = self.model.ops.xp.asarray(pred_starts) - pred_ends = self.model.ops.xp.asarray(pred_ends) - start_accuracy = (starts == pred_starts).mean() - end_accuracy = (ends == pred_ends).mean() - start_scores.append(float(start_accuracy)) - end_scores.append(float(end_accuracy)) - out = { - "span_start_accuracy": mean(start_scores), - "span_end_accuracy": mean(end_scores) - } - return out + starts = xp.asarray(starts) + ends = xp.asarray(ends) + pred_starts = xp.asarray(pred_starts) + pred_ends = xp.asarray(pred_ends) + correct = (starts == pred_starts) * (ends == pred_ends) + accuracy = correct.mean() + scores.append(float(accuracy)) + return {"span_accuracy": mean(scores)} From e512874c809bd35429979c66943af4212486a33e Mon Sep 17 00:00:00 2001 From: kadarakos Date: Tue, 10 May 2022 16:40:31 +0000 Subject: [PATCH 18/18] small refactor and docs --- spacy/ml/models/coref.py | 189 ++++++++++++++------------------------- 1 file changed, 67 insertions(+), 122 deletions(-) diff --git a/spacy/ml/models/coref.py b/spacy/ml/models/coref.py index 4e8e604d8..435c3bc80 100644 --- a/spacy/ml/models/coref.py +++ b/spacy/ml/models/coref.py @@ -1,14 +1,14 @@ from typing import List, Tuple import torch -from thinc.api import Model, chain, tuplify +from thinc.api import Model, chain from thinc.api import PyTorchWrapper, ArgsKwargs -from thinc.types import Floats2d, Ints1d, Ints2d +from thinc.types import Floats2d, Ints2d from thinc.util import xp2torch, torch2xp from ...tokens import Doc from ...util import registry -from .coref_util import add_dummy, get_sentence_ids +from .coref_util import add_dummy @registry.architectures("spacy.Coref.v1") @@ -19,7 +19,6 @@ def build_wl_coref_model( n_hidden_layers: int = 1, # TODO rename to "depth"? dropout: float = 0.3, # pairs to keep per mention after rough scoring - # TODO change to meaningful name rough_k: int = 50, # TODO is this not a training loop setting? a_scoring_batch_size: int = 512, @@ -34,7 +33,6 @@ def build_wl_coref_model( dim = 768 with Model.define_operators({">>": chain}): - # TODO chain tok2vec with these models coref_scorer = PyTorchWrapper( CorefScorer( dim, @@ -49,18 +47,6 @@ def build_wl_coref_model( convert_outputs=convert_coref_scorer_outputs, ) coref_model = tok2vec >> coref_scorer - # XXX just ignore this until the coref scorer is integrated - # span_predictor = PyTorchWrapper( - # SpanPredictor( - # TODO this was hardcoded to 1024, check - # hidden_size, - # sp_embedding_size, - # ), - # convert_inputs=convert_span_predictor_inputs - # ) - # TODO combine models so output is uniform (just one forward pass) - # It may be reasonable to have an option to disable span prediction, - # and just return words as spans. return coref_model @@ -95,46 +81,13 @@ def convert_coref_scorer_outputs(model: Model, inputs_outputs, is_train: bool): return (scores_xp, indices_xp), convert_for_torch_backward -# TODO add docstring for this, maybe move to utils. -# This might belong in the component. 
-def _clusterize(model, scores: Floats2d, top_indices: Ints2d):
-    xp = model.ops.xp
-    antecedents = scores.argmax(axis=1) - 1
-    not_dummy = antecedents >= 0
-    coref_span_heads = xp.arange(0, len(scores))[not_dummy]
-    antecedents = top_indices[coref_span_heads, antecedents[not_dummy]]
-    n_words = scores.shape[0]
-    nodes = [GraphNode(i) for i in range(n_words)]
-    for i, j in zip(coref_span_heads.tolist(), antecedents.tolist()):
-        nodes[i].link(nodes[j])
-        assert nodes[i] is not nodes[j]
-
-    clusters = []
-    for node in nodes:
-        if len(node.links) > 0 and not node.visited:
-            cluster = []
-            stack = [node]
-            while stack:
-                current_node = stack.pop()
-                current_node.visited = True
-                cluster.append(current_node.id)
-                stack.extend(link for link in current_node.links if not link.visited)
-            assert len(cluster) > 1
-            clusters.append(sorted(cluster))
-    return sorted(clusters)
-
-
 class CorefScorer(torch.nn.Module):
-    """Combines all coref modules together to find coreferent spans.
-
-    Attributes:
-        epochs_trained (int): number of epochs the model has been trained for
-
+    """
+    Combines all coref modules together to find coreferent token pairs.
     Submodules (in the order of their usage in the pipeline):
-        rough_scorer (RoughScorer)
-        pw (PairwiseEncoder)
-        a_scorer (AnaphoricityScorer)
-        sp (SpanPredictor)
+        - rough_scorer (RoughScorer) that prunes candidate pairs
+        - pw (DistancePairwiseEncoder) that computes pairwise features
+        - a_scorer (AnaphoricityScorer) produces the final scores
     """

     def __init__(
         self,
         dim: int,  # tok2vec size
         dist_emb_size: int,
         hidden_size: int,
         n_layers: int,
         dropout_rate: float,
         roughk: int,
         batch_size: int
     ):
         super().__init__()
         """
-        A newly created model is set to evaluation mode.
-
-        Args:
-            epochs_trained (int): the number of epochs finished
-                (useful for warm start)
+        dim: Size of the input features.
+        dist_emb_size: Size of the distance embeddings.
+        hidden_size: Size of the coreference candidate embeddings.
+        n_layers: Numbers of layers in the AnaphoricityScorer.
+        dropout_rate: Dropout probability to apply across all modules.
+        roughk: Number of candidates the RoughScorer returns.
+        batch_size: Internal batch-size for the more expensive AnaphoricityScorer.
         """
+        self.dropout = torch.nn.Dropout(dropout_rate)
+        self.batch_size = batch_size
+        # Modules
+        self.lstm = torch.nn.LSTM(
+            input_size=dim,
+            hidden_size=dim,
+            batch_first=True,
+        )
+        self.rough_scorer = RoughScorer(dim, dropout_rate, roughk)
         self.pw = DistancePairwiseEncoder(dist_emb_size, dropout_rate)
-        # TODO clean this up
-        bert_emb = dim
-        pair_emb = bert_emb * 3 + self.pw.shape
+        pair_emb = dim * 3 + self.pw.shape
         self.a_scorer = AnaphoricityScorer(
             pair_emb, hidden_size, n_layers, dropout_rate
         )
-        self.lstm = torch.nn.LSTM(
-            input_size=bert_emb,
-            hidden_size=bert_emb,
-            batch_first=True,
-        )
-        self.dropout = torch.nn.Dropout(dropout_rate)
-        self.rough_scorer = RoughScorer(bert_emb, dropout_rate, roughk)
-        self.batch_size = batch_size

     def forward(self, word_features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """
-        This is a massive method, but it made sense to me to not split it into
-        several ones to let one see the data flow.
+        1. LSTM encodes the incoming word_features.
+        2. The RoughScorer scores and prunes the candidates.
+        3. The DistancePairwiseEncoder embeds the distance between remaining pairs.
+        4. The AnaphoricityScorer scores all pairs in mini-batches.
- Args: - word_features: torch.Tensor containing word encodings - Returns: - coreference scores and top indices + word_features: torch.Tensor containing word encodings + + returns: + coref_scores: n_words x roughk floats. + top_indices: n_words x roughk integers. """ - # words [n_words, span_emb] - # cluster_ids [n_words] self.lstm.flatten_parameters() # XXX without this there's a warning word_features = torch.unsqueeze(word_features, dim=0) words, _ = self.lstm(word_features) words = words.squeeze() + # words: n_words x dim words = self.dropout(words) # Obtain bilinear scores and leave only top-k antecedents for each word - # top_rough_scores [n_words, n_ants] - # top_indices [n_words, n_ants] + # top_rough_scores: (n_words x roughk) + # top_indices: (n_words x roughk) top_rough_scores, top_indices = self.rough_scorer(words) - # Get pairwise features [n_words, n_ants, n_pw_features] + # Get pairwise features + # (n_words x roughk x n_pw_features) pw = self.pw(top_indices) batch_size = self.batch_size a_scores_lst: List[torch.Tensor] = [] @@ -272,13 +229,8 @@ class AnaphoricityScorer(torch.nn.Module): def _ffnn(self, x: torch.Tensor) -> torch.Tensor: """ - Calculates anaphoricity scores. - - Args: - x: tensor of shape [batch_size, n_ants, n_features] - - Returns: - tensor of shape [batch_size, n_ants] + x: tensor of shape (batch_size x roughk x n_features + returns: tensor of shape (batch_size x rough_k) """ x = self.out(self.hidden(x)) return x.squeeze(2) @@ -293,21 +245,18 @@ class AnaphoricityScorer(torch.nn.Module): """ Builds the matrix used as input for AnaphoricityScorer. - Args: - all_mentions (torch.Tensor): [n_mentions, mention_emb], - all the valid mentions of the document, - can be on a different device - mentions_batch (torch.Tensor): [batch_size, mention_emb], - the mentions of the current batch, - is expected to be on the current device - pw_batch (torch.Tensor): [batch_size, n_ants, pw_emb], - pairwise features of the current batch, - is expected to be on the current device - top_indices_batch (torch.Tensor): [batch_size, n_ants], - indices of antecedents of each mention + all_mentions: (n_mentions x mention_emb), + all the valid mentions of the document, + can be on a different device + mentions_batch: (batch_size x mention_emb), + the mentions of the current batch. + pw_batch: (batch_size x roughk x pw_emb), + pairwise distance features of the current batch. + top_indices_batch: (batch_size x n_ants), + indices of antecedents of each mention Returns: - torch.Tensor: [batch_size, n_ants, pair_emb] + out: pairwise features (batch_size x n_ants x pair_emb) """ emb_size = mentions_batch.shape[1] n_ants = pw_batch.shape[1] @@ -322,16 +271,15 @@ class AnaphoricityScorer(torch.nn.Module): class RoughScorer(torch.nn.Module): """ - Is needed to give a roughly estimate of the anaphoricity of two candidates, - only top scoring candidates are considered on later steps to reduce - computational complexity. + Cheaper module that gives a rough estimate of the anaphoricity of two + candidates, only top scoring candidates are considered on later + steps to reduce computational cost. 
""" def __init__(self, features: int, dropout_rate: float, rough_k: float): super().__init__() self.dropout = torch.nn.Dropout(dropout_rate) self.bilinear = torch.nn.Linear(features, features) - self.k = rough_k def forward( @@ -348,21 +296,6 @@ class RoughScorer(torch.nn.Module): pair_mask = torch.log((pair_mask > 0).to(torch.float)) bilinear_scores = self.dropout(self.bilinear(mentions)).mm(mentions.T) rough_scores = pair_mask + bilinear_scores - - return self._prune(rough_scores) - - def _prune(self, rough_scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Selects top-k rough antecedent scores for each mention. - - Args: - rough_scores: tensor of shape [n_mentions, n_mentions], containing - rough antecedent scores of each mention-antecedent pair. - - Returns: - FloatTensor of shape [n_mentions, k], top rough scores - LongTensor of shape [n_mentions, k], top indices - """ top_scores, indices = torch.topk( rough_scores, k=min(self.k, len(rough_scores)), dim=1, sorted=False ) @@ -371,6 +304,18 @@ class RoughScorer(torch.nn.Module): class DistancePairwiseEncoder(torch.nn.Module): def __init__(self, embedding_size, dropout_rate): + """ + Takes the top_indices indicating, which is a ranked + list for each word and its most likely corresponding + anaphora candidates. For each of these pairs it looks + up a distance embedding from a table, where the distance + corresponds to the log-distance. + + embedding_size: int, + Dimensionality of the distance-embeddings table. + dropout_rate: float, + Dropout probability. + """ super().__init__() emb_size = embedding_size self.distance_emb = torch.nn.Embedding(9, emb_size) @@ -378,7 +323,7 @@ class DistancePairwiseEncoder(torch.nn.Module): self.shape = emb_size def forward( - self, # type: ignore # pylint: disable=arguments-differ #35566 in pytorch + self, top_indices: torch.Tensor, ) -> torch.Tensor: word_ids = torch.arange(0, top_indices.size(0))