Add option for improved NER feature extraction (#4671)

* Support option of three NER features

* Expose nr_feature parser model setting

* Give feature tokens better name

* Test nr_feature=3 for NER

* Format
This commit is contained in:
Matthew Honnibal 2019-11-19 15:03:14 +01:00 committed by GitHub
parent 5ad5c4b44a
commit 4b123952aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 5 deletions

View File

@ -100,10 +100,30 @@ cdef cppclass StateC:
free(this.shifted - PADDING) free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil: void set_context_tokens(int* ids, int n) nogil:
if n == 2: if n == 1:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
elif n == 2:
ids[0] = this.B(0) ids[0] = this.B(0)
ids[1] = this.S(0) ids[1] = this.S(0)
if n == 8: elif n == 3:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
# First word of entity, if any
if this.entity_is_open():
ids[1] = this.E(0)
else:
ids[1] = -1
# Last word of entity, if within entity
if ids[0] == -1 or ids[1] == -1:
ids[2] = -1
else:
ids[2] = ids[0] - 1
elif n == 8:
ids[0] = this.B(0) ids[0] = this.B(0)
ids[1] = this.B(1) ids[1] = this.B(1)
ids[2] = this.S(0) ids[2] = this.S(0)

View File

@ -61,6 +61,7 @@ cdef class Parser:
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3)) t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0)) bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0)) self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
if depth != 1: if depth != 1:
raise ValueError(TempErrors.T004.format(value=depth)) raise ValueError(TempErrors.T004.format(value=depth))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
@ -80,7 +81,7 @@ cdef class Parser:
tok2vec = chain(tok2vec, flatten) tok2vec = chain(tok2vec, flatten)
tok2vec.nO = token_vector_width tok2vec.nO = token_vector_width
lower = PrecomputableAffine(hidden_width, lower = PrecomputableAffine(hidden_width,
nF=cls.nr_feature, nI=token_vector_width, nF=nr_feature_tokens, nI=token_vector_width,
nP=parser_maxout_pieces) nP=parser_maxout_pieces)
lower.nP = parser_maxout_pieces lower.nP = parser_maxout_pieces
@ -90,6 +91,7 @@ cdef class Parser:
cfg = { cfg = {
'nr_class': nr_class, 'nr_class': nr_class,
'nr_feature_tokens': nr_feature_tokens,
'hidden_depth': depth, 'hidden_depth': depth,
'token_vector_width': token_vector_width, 'token_vector_width': token_vector_width,
'hidden_width': hidden_width, 'hidden_width': hidden_width,
@ -133,6 +135,7 @@ cdef class Parser:
if 'beam_update_prob' not in cfg: if 'beam_update_prob' not in cfg:
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0) cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
cfg.setdefault('cnn_maxout_pieces', 3) cfg.setdefault('cnn_maxout_pieces', 3)
cfg.setdefault("nr_feature_tokens", self.nr_feature)
self.cfg = cfg self.cfg = cfg
self.model = model self.model = model
self._multitasks = [] self._multitasks = []
@ -299,7 +302,7 @@ cdef class Parser:
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C') dtype='i', order='C')
cdef int* c_ids cdef int* c_ids
cdef int nr_feature = self.nr_feature cdef int nr_feature = self.cfg["nr_feature_tokens"]
cdef int n_states cdef int n_states
model = self.model(docs) model = self.model(docs)
todo = [beam for beam in beams if not beam.is_done] todo = [beam for beam in beams if not beam.is_done]
@ -502,7 +505,7 @@ cdef class Parser:
self.moves.preprocess_gold(gold) self.moves.preprocess_gold(gold)
model, finish_update = self.model.begin_update(docs, drop=drop) model, finish_update = self.model.begin_update(docs, drop=drop)
states_d_scores, backprops, beams = _beam_utils.update_beam( states_d_scores, backprops, beams = _beam_utils.update_beam(
self.moves, self.nr_feature, 10000, states, golds, model.state2vec, self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
model.vec2scores, width, drop=drop, losses=losses, model.vec2scores, width, drop=drop, losses=losses,
beam_density=beam_density) beam_density=beam_density)
for i, d_scores in enumerate(states_d_scores): for i, d_scores in enumerate(states_d_scores):

View File

@ -259,6 +259,27 @@ def test_block_ner():
assert [token.ent_type_ for token in doc] == expected_types assert [token.ent_type_ for token in doc] == expected_types
def test_change_number_features():
    """Check that the NER lower layer honours the ``nr_feature_tokens`` setting.

    Covers three things: the default width (``ner.nr_feature``), an explicit
    override of 3 passed through ``component_cfg``, and a smoke run of the
    overridden pipeline on a short text.
    """
    # Test the default number of features
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("PERSON")
    nlp.begin_training()
    assert ner.model.lower.nF == ner.nr_feature
    # Test we can change it
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("PERSON")
    nlp.begin_training(
        component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
    )
    assert ner.model.lower.nF == 3
    # Test the model runs: the pipeline must process a text end-to-end with
    # the non-default feature count and return the tokenized document.
    doc = nlp("hello world")
    assert len(doc) == 2
class BlockerComponent1(object): class BlockerComponent1(object):
name = "my_blocker" name = "my_blocker"