Delete all the coref-hoi code

2025-09-18 01:52:37 +03:00 · 2022-03-15 20:05:24 +09:00 · 2022-03-15 20:05:24 +09:00 · d0ae2590db
commit d0ae2590db
parent abdc7d87af
1 changed files with 1 additions and 445 deletions
--- a/spacy/ml/models/coref.py
+++ b/spacy/ml/models/coref.py
@ -11,457 +11,13 @@ from ...tokens import Doc
 from ...util import registry
 from ..extract_spans import extract_spans
 from .coref_util import get_candidate_mentions, select_non_crossing_spans, topk
@registry.architectures("spacy.Coref.v1")
 def build_coref(
    tok2vec: Model[List[Doc], List[Floats2d]],
    get_mentions: Any = get_candidate_mentions,
    hidden: int = 1000,
    dropout: float = 0.3,
    mention_limit: int = 3900,
    # TODO this needs a better name. It limits the max mentions as a ratio of
    # the token count.
    mention_limit_ratio: float = 0.4,
    max_span_width: int = 20,
    antecedent_limit: int = 50,
 ):
    dim = tok2vec.get_dim("nO") * 3
    span_embedder = build_span_embedder(get_mentions, max_span_width)
    with Model.define_operators({">>": chain, "&": tuplify, "+": add}):
        mention_scorer = (
            Linear(nI=dim, nO=hidden)
            >> Relu(nI=hidden, nO=hidden)
            >> Dropout(dropout)
            >> Linear(nI=hidden, nO=hidden)
            >> Relu(nI=hidden, nO=hidden)
            >> Dropout(dropout)
            >> Linear(nI=hidden, nO=1)
        )
        mention_scorer.initialize()
        # TODO make feature_embed_size a param
        feature_embed_size = 20
        width_scorer = build_width_scorer(max_span_width, hidden, feature_embed_size)
        bilinear = Linear(nI=dim, nO=dim) >> Dropout(dropout)
        bilinear.initialize()
        ms = (build_take_vecs() >> mention_scorer) + width_scorer
        model = (
            (tok2vec & noop())
            >> span_embedder
            >> (ms & noop())
            >> build_coarse_pruner(mention_limit, mention_limit_ratio)
            >> build_ant_scorer(bilinear, Dropout(dropout), antecedent_limit)
        )
    return model
@dataclass
 class SpanEmbeddings:
    indices: Ints2d  # Array with 2 columns (for start and end index)
    vectors: Ragged  # Ragged[Floats2d] # One vector per span
    # NB: We assume that the indices refer to a concatenated Floats2d that
    # has one row per token in the *batch* of documents. This makes it unambiguous
    # which row is in which document, because if the lengths are e.g. [10, 5],
    # a span starting at 11 must be starting at token 2 of doc 1. A bug could
    # potentially cause you to have a span which crosses a doc boundary though,
    # which would be bad.
    # The lengths in the Ragged are not the tokens per doc, but the number of
    # mentions per doc.
    def __add__(self, right):
        out = self.vectors.data + right.vectors.data
        return SpanEmbeddings(self.indices, Ragged(out, self.vectors.lengths))
    def __iadd__(self, right):
        self.vectors.data += right.vectors.data
        return self
 def build_width_scorer(max_span_width, hidden_size, feature_embed_size=20):
    span_width_prior = (
        Embed(nV=max_span_width, nO=feature_embed_size)
        >> Linear(nI=feature_embed_size, nO=hidden_size)
        >> Relu(nI=hidden_size, nO=hidden_size)
        >> Dropout()
        >> Linear(nI=hidden_size, nO=1)
    )
    span_width_prior.initialize()
    model = Model("WidthScorer", forward=width_score_forward, layers=[span_width_prior])
    model.set_ref("width_prior", span_width_prior)
    return model
 def width_score_forward(
    model, embeds: SpanEmbeddings, is_train
 ) -> Tuple[Floats1d, Callable]:
    # calculate widths, subtracting 1 so it's 0-index
    w_ffnn = model.get_ref("width_prior")
    idxs = embeds.indices
    widths = idxs[:, 1] - idxs[:, 0] - 1
    wscores, width_b = w_ffnn(widths, is_train)
    lens = embeds.vectors.lengths
    def width_score_backward(d_score: Floats1d) -> SpanEmbeddings:
        dX = width_b(d_score)
        vecs = Ragged(dX, lens)
        return SpanEmbeddings(idxs, vecs)
    return wscores, width_score_backward
 # model converting a Doc/Mention to span embeddings
 # get_mentions: Callable[Doc, Pairs[int]]
 def build_span_embedder(
    get_mentions: Callable,
    max_span_width: int = 20,
 ) -> Model[Tuple[List[Floats2d], List[Doc]], SpanEmbeddings]:
    with Model.define_operators({">>": chain, "|": concatenate}):
        span_reduce = extract_spans() >> (
            reduce_first() | reduce_last() | reduce_mean()
        )
    model = Model(
        "SpanEmbedding",
        forward=span_embeddings_forward,
        attrs={
            "get_mentions": get_mentions,
            # XXX might be better to make this an implicit parameter in the
            # mention generator
            "max_span_width": max_span_width,
        },
        layers=[span_reduce],
    )
    model.set_ref("span_reducer", span_reduce)
    return model
 def span_embeddings_forward(
    model, inputs: Tuple[List[Floats2d], List[Doc]], is_train
 ) -> Tuple[SpanEmbeddings, Callable]:
    ops = model.ops
    xp = ops.xp
    tokvecs, docs = inputs
    # TODO fix this
    dim = tokvecs[0].shape[1]
    get_mentions = model.attrs["get_mentions"]
    max_span_width = model.attrs["max_span_width"]
    mentions = ops.alloc2i(0, 2)
    docmenlens = []  # number of mentions per doc
    for doc in docs:
        starts, ends = get_mentions(doc, max_span_width)
        docmenlens.append(len(starts))
        cments = ops.asarray2i([starts, ends]).transpose()
        mentions = xp.concatenate((mentions, cments))
    # TODO support attention here
    tokvecs = xp.concatenate(tokvecs)
    doclens = [len(doc) for doc in docs]
    tokvecs_r = Ragged(tokvecs, doclens)
    mentions_r = Ragged(mentions, docmenlens)
    span_reduce = model.get_ref("span_reducer")
    spanvecs, span_reduce_back = span_reduce((tokvecs_r, mentions_r), is_train)
    embeds = Ragged(spanvecs, docmenlens)
    def backprop_span_embed(dY: SpanEmbeddings) -> Tuple[List[Floats2d], List[Doc]]:
        grad, idxes = span_reduce_back(dY.vectors.data)
        oweights = []
        offset = 0
        for doclen in doclens:
            hi = offset + doclen
            oweights.append(grad.data[offset:hi])
            offset = hi
        return oweights, docs
    return SpanEmbeddings(mentions, embeds), backprop_span_embed
 def build_coarse_pruner(
    mention_limit: int,
    mention_limit_ratio: float,
 ) -> Model[SpanEmbeddings, SpanEmbeddings]:
    model = Model(
        "CoarsePruner",
        forward=coarse_prune,
        attrs={
            "mention_limit": mention_limit,
            "mention_limit_ratio": mention_limit_ratio,
        },
    )
    return model
 def coarse_prune(
    model, inputs: Tuple[Floats1d, SpanEmbeddings], is_train
 ) -> Tuple[Tuple[Floats1d, SpanEmbeddings], Callable]:
    """Given scores for mention, output the top non-crossing mentions.
    Mentions can contain other mentions, but candidate mentions cannot cross each other.
    """
    rawscores, spanembeds = inputs
    scores = rawscores.flatten()
    mention_limit = model.attrs["mention_limit"]
    mention_limit_ratio = model.attrs["mention_limit_ratio"]
    # XXX: Issue here. Don't need docs to find crossing spans, but might for the limits.
    # In old code the limit can be:
    # - hard number per doc
    # - ratio of tokens in the doc
    offset = 0
    selected = []
    sellens = []
    for menlen in spanembeds.vectors.lengths:
        hi = offset + menlen
        cscores = scores[offset:hi]
        # negate it so highest numbers come first
        # This is relatively slow but can't be skipped.
        tops = (model.ops.xp.argsort(-1 * cscores)).tolist()
        starts = spanembeds.indices[offset:hi, 0].tolist()
        ends = spanembeds.indices[offset:hi:, 1].tolist()
        # calculate the doc length
        doclen = ends[-1] - starts[0]
        # XXX seems to make more sense to use menlen than doclen here?
        # coref-hoi uses doclen (number of words). 
        mlimit = min(mention_limit, int(mention_limit_ratio * doclen))
        # csel is a 1d integer list
        csel = select_non_crossing_spans(tops, starts, ends, mlimit)
        # add the offset so these indices are absolute
        csel = [ii + offset for ii in csel]
        # this should be constant because short choices are padded
        sellens.append(len(csel))
        selected += csel
        offset += menlen
    selected = model.ops.asarray1i(selected)
    top_spans = spanembeds.indices[selected]
    top_vecs = spanembeds.vectors.data[selected]
    out = SpanEmbeddings(top_spans, Ragged(top_vecs, sellens))
    # save some variables so the embeds can be garbage collected
    idxlen = spanembeds.indices.shape[0]
    vecshape = spanembeds.vectors.data.shape
    indices = spanembeds.indices
    veclens = out.vectors.lengths
    def coarse_prune_backprop(
        dY: Tuple[Floats1d, SpanEmbeddings]
    ) -> Tuple[Floats1d, SpanEmbeddings]:
        dYscores, dYembeds = dY
        dXscores = model.ops.alloc1f(idxlen)
        dXscores[selected] = dYscores.flatten()
        dXvecs = model.ops.alloc2f(*vecshape)
        dXvecs[selected] = dYembeds.vectors.data
        rout = Ragged(dXvecs, veclens)
        dXembeds = SpanEmbeddings(indices, rout)
        # inflate for mention scorer
        dXscores = model.ops.xp.expand_dims(dXscores, 1)
        return (dXscores, dXembeds)
    return (scores[selected], out), coarse_prune_backprop
 def build_take_vecs() -> Model[SpanEmbeddings, Floats2d]:
    # this just gets vectors out of spanembeddings
    # XXX Might be better to convert SpanEmbeddings to a tuple and use with_getitem
    return Model("TakeVecs", forward=take_vecs_forward)
 def take_vecs_forward(model, inputs: SpanEmbeddings, is_train) -> Floats2d:
    idxs = inputs.indices
    lens = inputs.vectors.lengths
    def backprop(dY: Floats2d) -> SpanEmbeddings:
        vecs = Ragged(dY, lens)
        return SpanEmbeddings(idxs, vecs)
    return inputs.vectors.data, backprop
 def build_ant_scorer(
    bilinear, dropout, ant_limit=50
 ) -> Model[Tuple[Floats1d, SpanEmbeddings], List[Floats2d]]:
    model = Model(
        "AntScorer",
        forward=ant_scorer_forward,
        layers=[bilinear, dropout],
        attrs={
            "ant_limit": ant_limit,
        },
    )
    model.set_ref("bilinear", bilinear)
    model.set_ref("dropout", dropout)
    return model
 def ant_scorer_forward(
    model, inputs: Tuple[Floats1d, SpanEmbeddings], is_train
 ) -> Tuple[Tuple[List[Tuple[Floats2d, Ints2d]], Ints2d], Callable]:
    ops = model.ops
    xp = ops.xp
    ant_limit = model.attrs["ant_limit"]
    # this contains the coarse bilinear in coref-hoi
    # coarse bilinear is a single layer linear network
    # TODO make these proper refs
    bilinear = model.get_ref("bilinear")
    dropout = model.get_ref("dropout")
    mscores, sembeds = inputs
    vecs = sembeds.vectors  # ragged
    offset = 0
    backprops = []
    out = []
    for ll in vecs.lengths:
        hi = offset + ll
        # each iteration is one doc
        # first calculate the pairwise product scores
        cvecs = vecs.data[offset:hi]
        pw_prod, prod_back = pairwise_product(bilinear, dropout, cvecs, is_train)
        # now calculate the pairwise mention scores
        ms = mscores[offset:hi].flatten()
        pw_sum, pw_sum_back = pairwise_sum(ops, ms)
        # make a mask so antecedents precede referrents
        ant_range = xp.arange(0, cvecs.shape[0])
        # This will take the log of 0, which causes a warning, but we're doing
        # it on purpose so we can just ignore the warning.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            mask = xp.log(
                (xp.expand_dims(ant_range, 1) - xp.expand_dims(ant_range, 0)) >= 1
            ).astype("f")
        scores = pw_prod + pw_sum + mask
        top_limit = min(ant_limit, len(scores))
        top_scores, top_scores_idx = topk(xp, scores, top_limit)
        # now add the placeholder
        placeholder = ops.alloc2f(scores.shape[0], 1)
        top_scores = xp.concatenate((placeholder, top_scores), 1)
        out.append((top_scores, top_scores_idx))
        # In the full model these scores can be further refined. In the current
        # state of this model we're done here, so this pruning is less important,
        # but it's still helpful for reducing memory usage (since scores can be
        # garbage collected when the loop exits).
        offset += ll
        backprops.append((prod_back, pw_sum_back))
    # save vars for gc
    vecshape = vecs.data.shape
    veclens = vecs.lengths
    scoreshape = mscores.shape
    idxes = sembeds.indices
    def backprop(
        dYs: Tuple[List[Tuple[Floats2d, Ints2d]], Ints2d]
    ) -> Tuple[Floats2d, SpanEmbeddings]:
        dYscores, dYembeds = dYs
        dXembeds = Ragged(ops.alloc2f(*vecshape), veclens)
        dXscores = ops.alloc1f(*scoreshape)
        offset = 0
        for dy, (prod_back, pw_sum_back), ll in zip(dYscores, backprops, veclens):
            hi = offset + ll
            dyscore, dyidx = dy
            # remove the placeholder
            dyscore = dyscore[:, 1:]
            # the full score grid is square
            fullscore = ops.alloc2f(ll, ll)
            for ii, (ridx, rscores) in enumerate(zip(dyidx, dyscore)):
                fullscore[ii][ridx] = rscores
            dXembeds.data[offset:hi] = prod_back(fullscore)
            dXscores[offset:hi] = pw_sum_back(fullscore)
            offset = hi
        # make it fit back into the linear
        dXscores = xp.expand_dims(dXscores, 1)
        return (dXscores, SpanEmbeddings(idxes, dXembeds))
    return (out, sembeds.indices), backprop
 def pairwise_sum(ops, mention_scores: Floats1d) -> Tuple[Floats2d, Callable]:
    """Find the most likely mention-antecedent pairs."""
    # This doesn't use multiplication because two items with low mention scores
    # don't make a good candidate pair.
    pw_sum = ops.xp.expand_dims(mention_scores, 1) + ops.xp.expand_dims(
        mention_scores, 0
    )
    def backward(d_pwsum: Floats2d) -> Floats1d:
        # For the backward pass, the gradient is distributed over the whole row and
        # column, so pull it all in.
        out = d_pwsum.sum(axis=0) + d_pwsum.sum(axis=1)
        return out
    return pw_sum, backward
 def pairwise_product(bilinear, dropout, vecs: Floats2d, is_train):
    # A neat side effect of this is that we don't have to pass the backprops
    # around separately because the closure handles them.
    source, source_b = bilinear(vecs, is_train)
    target, target_b = dropout(vecs.T, is_train)
    pw_prod = source @ target
    def backward(d_prod: Floats2d) -> Floats2d:
        dS = source_b(d_prod @ target.T)
        dT = target_b(source.T @ d_prod)
        dX = dS + dT.T
        return dX
    return pw_prod, backward
 # XXX here down is wl-coref
 from typing import List, Tuple
 import torch
 from thinc.util import xp2torch, torch2xp
 # TODO rename this to coref_util
 from .coref_util import add_dummy
 # TODO rename to plain coref
-@registry.architectures("spacy.WLCoref.v1")
+@registry.architectures("spacy.Coref.v1")
 def build_wl_coref_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    embedding_size: int = 20,