spaCy/spacy/ml/_iob.py
Matthew Honnibal 333b1a308b
Adapt parser and NER for transformers (#5449)
* Draft layer for BILUO actions

* Fixes to biluo layer

* WIP on BILUO layer

* Add tests for BILUO layer

* Format

* Fix transitions

* Update test

* Link in the simple_ner

* Update BILUO tagger

* Update __init__

* Import simple_ner

* Update test

* Import

* Add files

* Add config

* Fix label passing for BILUO and tagger

* Fix label handling for simple_ner component

* Update simple NER test

* Update config

* Hack train script

* Update BILUO layer

* Fix SimpleNER component

* Update train_from_config

* Add biluo_to_iob helper

* Add IOB layer

* Add IOBTagger model

* Update biluo layer

* Update SimpleNER tagger

* Update BILUO

* Read random seed in train-from-config

* Update use of normal_init

* Fix normalization of gradient in SimpleNER

* Update IOBTagger

* Remove print

* Tweak masking in BILUO

* Add dropout in SimpleNER

* Update thinc

* Tidy up simple_ner

* Fix biluo model

* Unhack train-from-config

* Update setup.cfg and requirements

* Add tb_framework.py for parser model

* Try to avoid memory leak in BILUO

* Move ParserModel into spacy.ml, avoid need for subclass.

* Use updated parser model

* Remove incorrect call to model.initialize in PrecomputableAffine

* Update parser model

* Avoid divide by zero in tagger

* Add extra dropout layer in tagger

* Refine minibatch_by_words function to avoid oom

* Fix parser model after refactor

* Try to avoid div-by-zero in SimpleNER

* Fix infinite loop in minibatch_by_words

* Use SequenceCategoricalCrossentropy in Tagger

* Fix parser model when hidden layer

* Remove extra dropout from tagger

* Add extra nan check in tagger

* Fix thinc version

* Update tests and imports

* Fix test

* Update test

* Update tests

* Fix tests

* Fix test

Co-authored-by: Ines Montani <ines@ines.io>
2020-05-18 22:23:33 +02:00


"""Thinc layer to do simpler transition-based parsing, NER, etc."""
from typing import List, Tuple, Dict, Optional
from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
from ..tokens import Doc
def IOB() -> Model[Padded, Padded]:
    return Model(
        "iob",
        forward,
        init=init,
        dims={"nO": None},
        attrs={"get_num_actions": get_num_actions},
    )

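
# A sketch of how this layer might be composed into a full IOB tagger,
# roughly mirroring spaCy's simple_ner models. The names `tok2vec`,
# `width` and `n_actions` are assumptions for illustration, not part of
# this module:
#
#     from thinc.api import chain, list2padded, with_array, Linear
#     from thinc.api import softmax_activation, padded2list
#
#     model = chain(
#         tok2vec,                               # List[Doc] -> List[Floats2d]
#         list2padded(),                         # ragged batch -> Padded
#         with_array(Linear(n_actions, width)),  # per-token action scores
#         IOB(),                                 # mask invalid transitions
#         with_array(softmax_activation()),      # normalize to probabilities
#         padded2list(),                         # Padded -> list per doc
#     )

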
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
    # Padded data has shape (n_tokens, n_docs, n_actions), so the output
    # dimension is the trailing axis.
    if X is not None and Y is not None:
        if X.data.shape != Y.data.shape:
            raise ValueError(f"Mismatched shapes: {X.data.shape} vs {Y.data.shape}")
        model.set_dim("nO", X.data.shape[2])
    elif X is not None:
        model.set_dim("nO", X.data.shape[2])
    elif Y is not None:
        model.set_dim("nO", Y.data.shape[2])
    elif model.has_dim("nO") is None:
        # has_dim returns None when the dim is registered but unset
        raise ValueError("Dimension unset for IOB: nO")


def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
    n_labels = (model.get_dim("nO") - 1) // 2
    n_tokens, n_docs, n_actions = Xp.data.shape
    # At each timestep we build a validity mask of shape (n_docs, n_actions)
    # indicating which actions may come next in each sequence. The mask is a
    # row of the (n_actions, n_actions) transition table, selected by each
    # sequence's previous action; before the first token, the previous
    # action is treated as O.
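    # For example, with n_labels=2 the action columns are ordered
    # [B-0, B-1, I-0, I-1, O]. If a sequence's previous action was B-0,
    # its mask row is [1, 1, 1, 0, 1]: only I-1 is invalid, so I-1's score
    # is pushed down by 1e9 before the argmax below.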
    valid_transitions = _get_transition_table(model.ops, n_labels)
    prev_actions = model.ops.alloc1i(n_docs)
    # Initialize as though the previous action was O
    prev_actions.fill(n_actions - 1)
    Y = model.ops.alloc3f(*Xp.data.shape)
    masks = model.ops.alloc3f(*Y.shape)
    for t in range(Xp.data.shape[0]):
        masks[t] = valid_transitions[prev_actions]
        # Don't train the out-of-bounds sequences.
        masks[t, Xp.size_at_t[t]:] = 0
        # Valid actions keep their scores; invalid actions get -1e9.
        Y[t] = Xp.data[t] + ((masks[t] - 1) * 1e9)
        prev_actions = Y[t].argmax(axis=-1)

    def backprop_iob(dY: Padded) -> Padded:
        # Masking the gradient seems to do poorly here. But why?
        # dY.data *= masks
        return dY

    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_iob


def get_num_actions(n_labels: int) -> int:
    # One BEGIN action per label
    # One IN action per label
    # One OUT action
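    # For example, n_labels=3 gives 3 BEGIN + 3 IN + 1 OUT = 7 actions.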
    return n_labels * 2 + 1


def _get_transition_table(
    ops: Ops, n_labels: int, _cache: Dict[int, Floats2d] = {}
) -> Floats2d:
    n_actions = get_num_actions(n_labels)
    if n_actions in _cache:
        return ops.asarray(_cache[n_actions])
    table = ops.alloc2f(n_actions, n_actions)
    B_start, B_end = (0, n_labels)
    I_start, I_end = (B_end, B_end + n_labels)
    O_action = I_end
    B_range = ops.xp.arange(B_start, B_end)
    I_range = ops.xp.arange(I_start, I_end)
    # B and O are always valid
    table[:, B_start:B_end] = 1
    table[:, O_action] = 1
    # I-k may only follow a matching B-k, or continue a matching I-k
    table[B_range, I_range] = 1
    table[I_range, I_range] = 1
    _cache[n_actions] = table
    return table
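
# A quick sanity check (a sketch, not part of the module; NumpyOps is
# Thinc's CPU backend):
#
#     from thinc.api import NumpyOps
#
#     table = _get_transition_table(NumpyOps(), n_labels=1)
#     # Rows are the previous action, columns the next, ordered [B-0, I-0, O]:
#     #   after B-0: [1, 1, 1]  (the entity may continue or end)
#     #   after I-0: [1, 1, 1]  (the entity may continue or end)
#     #   after O:   [1, 0, 1]  (IN may not follow OUT)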