mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Add option for improved NER feature extraction (#4671)
* Support option of three NER features
* Expose nr_feature parser model setting
* Give feature tokens better name
* Test nr_feature=3 for NER
* Format
This commit is contained in:
parent
5ad5c4b44a
commit
4b123952aa
|
@ -100,10 +100,30 @@ cdef cppclass StateC:
|
||||||
free(this.shifted - PADDING)
|
free(this.shifted - PADDING)
|
||||||
|
|
||||||
void set_context_tokens(int* ids, int n) nogil:
|
void set_context_tokens(int* ids, int n) nogil:
|
||||||
if n == 2:
|
if n == 1:
|
||||||
|
if this.B(0) >= 0:
|
||||||
|
ids[0] = this.B(0)
|
||||||
|
else:
|
||||||
|
ids[0] = -1
|
||||||
|
elif n == 2:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
ids[1] = this.S(0)
|
ids[1] = this.S(0)
|
||||||
if n == 8:
|
elif n == 3:
|
||||||
|
if this.B(0) >= 0:
|
||||||
|
ids[0] = this.B(0)
|
||||||
|
else:
|
||||||
|
ids[0] = -1
|
||||||
|
# First word of entity, if any
|
||||||
|
if this.entity_is_open():
|
||||||
|
ids[1] = this.E(0)
|
||||||
|
else:
|
||||||
|
ids[1] = -1
|
||||||
|
# Last word of entity, if within entity
|
||||||
|
if ids[0] == -1 or ids[1] == -1:
|
||||||
|
ids[2] = -1
|
||||||
|
else:
|
||||||
|
ids[2] = ids[0] - 1
|
||||||
|
elif n == 8:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
ids[1] = this.B(1)
|
ids[1] = this.B(1)
|
||||||
ids[2] = this.S(0)
|
ids[2] = this.S(0)
|
||||||
|
|
|
@ -61,6 +61,7 @@ cdef class Parser:
|
||||||
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
|
t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
|
||||||
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
|
bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
|
||||||
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
|
self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
|
||||||
|
nr_feature_tokens = cfg.get("nr_feature_tokens", cls.nr_feature)
|
||||||
if depth != 1:
|
if depth != 1:
|
||||||
raise ValueError(TempErrors.T004.format(value=depth))
|
raise ValueError(TempErrors.T004.format(value=depth))
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||||
|
@ -80,7 +81,7 @@ cdef class Parser:
|
||||||
tok2vec = chain(tok2vec, flatten)
|
tok2vec = chain(tok2vec, flatten)
|
||||||
tok2vec.nO = token_vector_width
|
tok2vec.nO = token_vector_width
|
||||||
lower = PrecomputableAffine(hidden_width,
|
lower = PrecomputableAffine(hidden_width,
|
||||||
nF=cls.nr_feature, nI=token_vector_width,
|
nF=nr_feature_tokens, nI=token_vector_width,
|
||||||
nP=parser_maxout_pieces)
|
nP=parser_maxout_pieces)
|
||||||
lower.nP = parser_maxout_pieces
|
lower.nP = parser_maxout_pieces
|
||||||
|
|
||||||
|
@ -90,6 +91,7 @@ cdef class Parser:
|
||||||
|
|
||||||
cfg = {
|
cfg = {
|
||||||
'nr_class': nr_class,
|
'nr_class': nr_class,
|
||||||
|
'nr_feature_tokens': nr_feature_tokens,
|
||||||
'hidden_depth': depth,
|
'hidden_depth': depth,
|
||||||
'token_vector_width': token_vector_width,
|
'token_vector_width': token_vector_width,
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
|
@ -133,6 +135,7 @@ cdef class Parser:
|
||||||
if 'beam_update_prob' not in cfg:
|
if 'beam_update_prob' not in cfg:
|
||||||
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
|
cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
|
||||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
|
cfg.setdefault("nr_feature_tokens", self.nr_feature)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.model = model
|
self.model = model
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
|
@ -299,7 +302,7 @@ cdef class Parser:
|
||||||
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
|
||||||
dtype='i', order='C')
|
dtype='i', order='C')
|
||||||
cdef int* c_ids
|
cdef int* c_ids
|
||||||
cdef int nr_feature = self.nr_feature
|
cdef int nr_feature = self.cfg["nr_feature_tokens"]
|
||||||
cdef int n_states
|
cdef int n_states
|
||||||
model = self.model(docs)
|
model = self.model(docs)
|
||||||
todo = [beam for beam in beams if not beam.is_done]
|
todo = [beam for beam in beams if not beam.is_done]
|
||||||
|
@ -502,7 +505,7 @@ cdef class Parser:
|
||||||
self.moves.preprocess_gold(gold)
|
self.moves.preprocess_gold(gold)
|
||||||
model, finish_update = self.model.begin_update(docs, drop=drop)
|
model, finish_update = self.model.begin_update(docs, drop=drop)
|
||||||
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
states_d_scores, backprops, beams = _beam_utils.update_beam(
|
||||||
self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
|
self.moves, self.cfg["nr_feature_tokens"], 10000, states, golds, model.state2vec,
|
||||||
model.vec2scores, width, drop=drop, losses=losses,
|
model.vec2scores, width, drop=drop, losses=losses,
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
for i, d_scores in enumerate(states_d_scores):
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
|
|
@ -259,6 +259,27 @@ def test_block_ner():
|
||||||
assert [token.ent_type_ for token in doc] == expected_types
|
assert [token.ent_type_ for token in doc] == expected_types
|
||||||
|
|
||||||
|
|
||||||
|
def test_change_number_features():
|
||||||
|
# Test the default number of features
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
ner.add_label("PERSON")
|
||||||
|
nlp.begin_training()
|
||||||
|
assert ner.model.lower.nF == ner.nr_feature
|
||||||
|
# Test we can change it
|
||||||
|
nlp = English()
|
||||||
|
ner = nlp.create_pipe("ner")
|
||||||
|
nlp.add_pipe(ner)
|
||||||
|
ner.add_label("PERSON")
|
||||||
|
nlp.begin_training(
|
||||||
|
component_cfg={"ner": {"nr_feature_tokens": 3, "token_vector_width": 128}}
|
||||||
|
)
|
||||||
|
assert ner.model.lower.nF == 3
|
||||||
|
# Test the model runs
|
||||||
|
doc = nlp("hello world")
|
||||||
|
|
||||||
|
|
||||||
class BlockerComponent1(object):
|
class BlockerComponent1(object):
|
||||||
name = "my_blocker"
|
name = "my_blocker"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user