Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)
Support negative examples in partial NER annotations (#8106)
* Support a cfg field in transition system
* Make NER 'has gold' check use right alignment for span
* Pass 'negative_samples_key' property into NER transition system
* Add field for negative samples to NER transition system
* Check neg_key in NER has_gold
* Support negative examples in NER oracle
* Test for negative examples in NER
* Fix name of config variable in NER
* Remove vestiges of old-style partial annotation
* Remove obsolete tests
* Add comment noting lack of support for negative samples in parser
* Additions to "neg examples" PR (#8201)
* add custom error and test for deprecated format
* add test for unlearning an entity
* add break also for Begin's cost
* add negative_samples_key property on Parser
* rename
* extend docs & fix some older docs issues
* add subclass constructors, clean up tests, fix docs
* add flaky test with ValueError if gold parse was not found
* remove ValueError if n_gold == 0
* fix docstring
* Hack in environment variables to try out training
* Remove hack
* Remove NER hack, and support 'negative O' samples
* Fix O oracle
* Fix transition parser
* Remove 'not O' from oracle
* Fix NER oracle
* check for spans in both gold.ents and gold.spans and raise if so, to prevent memory access violation
* use set instead of list in consistency check

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Parent: 02bac8f269
Commit: 6f5e308d17
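For orientation before the diff: the annotation workflow this commit enables looks roughly as follows. This is a minimal sketch, not part of the commit itself; the span-group key "incorrect_spans" and the sentence are illustrative.

```python
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Who is Shaka Khan?")
# The tests below likewise use the same Doc as predicted and reference.
example = Example(doc, doc)
# Negative sample: "Shaka Khan" (tokens 2-4) is known NOT to be a PERSON.
# It lives in a span group on the reference doc, replacing the removed
# '!PERSON' entity notation.
example.reference.spans["incorrect_spans"] = [
    Span(example.reference, 2, 4, label="PERSON")
]
```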
@@ -521,6 +521,13 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

     # New errors added in v3.x
+    E868 = ("Found a conflicting gold annotation in a reference document, "
+            "with the following char-based span occurring both in the gold ents "
+            "as well as in the negative spans: {span}.")
+    E869 = ("The notation '{label}' is not supported anymore. To annotate "
+            "negative NER samples, use `doc.spans[key]` instead, and "
+            "specify the key as 'incorrect_spans_key' when constructing "
+            "the NER component.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
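E868 guards against contradictory supervision: the same (start, end, label) triple occurring both in the gold entities and in the negative span group. A sketch of data that would trigger it, continuing the example above (the key name is still illustrative):

```python
from spacy.tokens import Span

ref = example.reference
# The same span marked as both a gold PERSON and a negative PERSON:
ref.ents = [Span(ref, 2, 4, label="PERSON")]
ref.spans["incorrect_spans"] = [Span(ref, 2, 4, label="PERSON")]
# During training, create_gold_state() (see the NER oracle below) raises
# ValueError(E868) for such a document instead of risking a memory access
# violation in the C-level oracle.
```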
@@ -1,3 +1,5 @@
+import os
+import random
 from libc.stdint cimport int32_t
 from cymem.cymem cimport Pool

@@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam

 from ...tokens.doc cimport Doc
 from ...tokens.span import Span
+from ...tokens.span cimport Span
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...structs cimport TokenC
+from ...structs cimport TokenC, SpanC
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC

@@ -25,7 +28,6 @@ cdef enum:
     LAST
     UNIT
     OUT
-    ISNT
     N_MOVES

@@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
 MOVE_NAMES[LAST] = 'L'
 MOVE_NAMES[UNIT] = 'U'
 MOVE_NAMES[OUT] = 'O'
-MOVE_NAMES[ISNT] = 'x'


 cdef struct GoldNERStateC:
     Transition* ner
+    SpanC* negs
     int32_t length
+    int32_t nr_neg


 cdef class BiluoGold:
     cdef Pool mem
     cdef GoldNERStateC c

-    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
+    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
         self.mem = Pool()
-        self.c = create_gold_state(self.mem, moves, stcls.c, example)
+        self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)

     def update(self, StateClass stcls):
         update_gold_state(&self.c, stcls.c)


 cdef GoldNERStateC create_gold_state(
     Pool mem,
     BiluoPushDown moves,
     const StateC* stcls,
-    Example example
+    Example example,
+    neg_key
 ) except *:
     cdef GoldNERStateC gs
+    cdef Span neg
+    if neg_key is not None:
+        negs = example.get_aligned_spans_y2x(
+            example.y.spans.get(neg_key, []),
+            allow_overlap=True
+        )
+    else:
+        negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    ner_tags = example.get_aligned_ner()
+    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
+    gs.nr_neg = len(negs)
+    ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
+
+    # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
+    neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
+    for pos_span in ner_ents:
+        if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
+            raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
+
+    # In order to handle negative samples, we need to maintain the full
+    # (start, end, label) triple. If we break it down to the 'isnt B-LOC'
+    # thing, we'll get blocked if there's an incorrect prefix.
+    for i, neg in enumerate(negs):
+        gs.negs[i] = neg.c
     return gs
@@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
         cdef attr_t label
         if name == '-' or name == '' or name is None:
             return Transition(clas=0, move=MISSING, label=0, score=0)
-        elif name == '!O':
-            return Transition(clas=0, move=ISNT, label=0, score=0)
         elif '-' in name:
             move_str, label_str = name.split('-', 1)
-            # Hacky way to denote 'not this entity'
+            # Deprecated, hacky way to denote 'not this entity'
             if label_str.startswith('!'):
-                label_str = label_str[1:]
-                move_str = 'x'
+                raise ValueError(Errors.E869.format(label=name))
             label = self.strings.add(label_str)
         else:
             move_str = name
             label = 0
         move = MOVE_NAMES.index(move_str)
-        if move == ISNT:
-            return Transition(clas=0, move=ISNT, label=label, score=0)
         for i in range(self.n_moves):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]

@@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
             label_id = label_name
         if action == OUT and label_id != 0:
             return None
-        if action == MISSING or action == ISNT:
+        if action == MISSING:
             return None
         # Check we're not creating a move we already have, so that this is
         # idempotent

@@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
         return parses

     def init_gold(self, StateClass state, Example example):
-        return BiluoGold(self, state, example)
+        return BiluoGold(self, state, example, self.neg_key)

     def has_gold(self, Example eg, start=0, end=None):
+        # We get x and y referring to X, we want to check relative to Y,
+        # the reference
+        y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
+        if not y_spans:
+            y_spans = [eg.y[:]]
+        y_span = y_spans[0]
+        start = y_span.start
+        end = y_span.end
+        neg_key = self.neg_key
+        if neg_key is not None:
+            # If we have any negative samples, count that as having annotation.
+            for span in eg.y.spans.get(neg_key, []):
+                if span.start >= start and span.end <= end:
+                    return True
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True

@@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
                 n_gold += costs[i] <= 0
             else:
                 costs[i] = 9000
-        if n_gold < 1:
-            raise ValueError


 cdef class Missing:
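Two behavioural consequences of the hunks above, sketched as assumed usage (the label set and key name are illustrative; `get_actions` and the `incorrect_spans_key` argument appear in the tests at the end of this diff):

```python
from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.tokens import Doc, Span
from spacy.training import Example
from spacy.vocab import Vocab

vocab = Vocab()
actions = BiluoPushDown.get_actions(entity_types=["PERSON"])
moves = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="incorrect_spans")

# 1) The deprecated '!' notation is rejected instead of mapping to the
#    removed ISNT move:
#        moves.lookup_transition("B-!PERSON")  # raises ValueError (E869)

# 2) has_gold() now counts negative samples as annotation, so an example
#    whose only supervision is "this span is NOT an entity" is no longer
#    skipped by the training loop:
doc = Doc(vocab, words=["Who", "is", "Shaka", "Khan", "?"])
example = Example(doc, doc)
example.y.spans["incorrect_spans"] = [Span(example.y, 2, 4, label="PERSON")]
assert moves.has_gold(example)
```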
@@ -373,23 +405,33 @@ cdef class Begin:
     @staticmethod
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        b0 = s.B(0)
+        cdef int cost = 0
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # B, Gold B --> Label match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # B, Gold I --> False (P)
             # B, Gold L --> False (P)
             # B, Gold O --> False (P)
             # B, Gold U --> False (P)
-            return 1
+            cost += 1
+        if s.buffer_length() < 3:
+            # Handle negatives. In general we can't really do much to block
+            # B, because we don't know whether the whole entity is going to
+            # be correct or not. However, we can at least tell whether we're
+            # going to be opening an entity where there's only one possible
+            # L.
+            for span in gold.negs[:gold.nr_neg]:
+                if span.label == label and span.start == b0:
+                    cost += 1
+                    break
+        return cost


 cdef class In:

@@ -462,9 +504,6 @@ cdef class In:
         elif g_act == UNIT:
             # I, Gold U --> True iff next tag == O
             return next_act != OUT
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
         else:
             return 1

@@ -504,32 +543,41 @@ cdef class Last:
     cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
         gold = <GoldNERStateC*>_gold
         move = LAST
+        b0 = s.B(0)
+        ent_start = s.E(0)

-        cdef int g_act = gold.ner[s.B(0)].move
-        cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int g_act = gold.ner[b0].move
+        cdef attr_t g_tag = gold.ner[b0].label

+        cdef int cost = 0

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == BEGIN:
             # L, Gold B --> True
-            return 0
+            pass
         elif g_act == IN:
             # L, Gold I --> True iff this entity sunk
-            return not _entity_is_sunk(s, gold.ner)
+            cost += not _entity_is_sunk(s, gold.ner)
         elif g_act == LAST:
             # L, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # L, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # L, Gold U --> True
-            return 0
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return 0
+            pass
         else:
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective,
+        # by marking actions that close an entity that we know is incorrect
+        # as costly.
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+                cost += 1
+                break
+        return cost


 cdef class Unit:

@@ -573,21 +621,29 @@ cdef class Unit:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
+        cdef int cost = 0

         if g_act == MISSING:
-            return 0
+            pass
         elif g_act == UNIT:
             # U, Gold U --> True iff tag match
-            return label != g_tag
-        # Support partial supervision in the form of "not this label"
-        elif g_act == ISNT:
-            return label == g_tag
+            cost += label != g_tag
         else:
             # U, Gold B --> False
             # U, Gold I --> False
             # U, Gold L --> False
             # U, Gold O --> False
-            return 1
+            cost += 1
+        # If we have negative-example entities, integrate them into the objective.
+        # This is fairly straight-forward for U- entities, as we have a single
+        # action
+        cdef int b0 = s.B(0)
+        for span in gold.negs[:gold.nr_neg]:
+            if span.label == label and span.start == b0 and span.end == (b0+1):
+                cost += 1
+                break
+        return cost


 cdef class Out:

@@ -613,25 +669,24 @@ cdef class Out:
         gold = <GoldNERStateC*>_gold
         cdef int g_act = gold.ner[s.B(0)].move
         cdef attr_t g_tag = gold.ner[s.B(0)].label
-        if g_act == ISNT and g_tag == 0:
-            return 1
-        elif g_act == MISSING or g_act == ISNT:
-            return 0
+        cdef weight_t cost = 0
+        if g_act == MISSING:
+            pass
         elif g_act == BEGIN:
             # O, Gold B --> False
-            return 1
+            cost += 1
         elif g_act == IN:
             # O, Gold I --> True
-            return 0
+            pass
         elif g_act == LAST:
             # O, Gold L --> True
-            return 0
+            pass
         elif g_act == OUT:
             # O, Gold O --> True
-            return 0
+            pass
         elif g_act == UNIT:
             # O, Gold U --> False
-            return 1
+            cost += 1
         else:
-            return 1
+            cost += 1
+        return cost
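The oracle changes above all follow one pattern: rather than returning early, each action accumulates a cost, then adds a penalty when taking the action would realise a span that appears in the negative list. A pure-Python restatement for the U (Unit) action, illustrative only and not part of the codebase:

```python
def unit_cost_sketch(gold_move, gold_label, label, b0, negs):
    """Pure-Python analogue of Unit.cost after this commit (illustrative only).

    negs is a list of (start, end, label) tuples for known-incorrect spans.
    """
    cost = 0
    if gold_move == "MISSING":
        pass  # no supervision for this token: the action is not penalised
    elif gold_move == "UNIT":
        cost += label != gold_label  # U, Gold U --> cost iff the tag differs
    else:
        cost += 1  # U against gold B/I/L/O is always wrong
    # Negative samples: penalise predicting exactly a single-token span at b0
    # that the annotator marked as incorrect.
    for start, end, neg_label in negs:
        if neg_label == label and start == b0 and end == b0 + 1:
            cost += 1
            break
    return cost


# Proposing U-PERSON at token 0 when (0, 1, "PERSON") is a negative span
# costs 1 even though the gold move is missing:
assert unit_cost_sketch("MISSING", None, "PERSON", 0, [(0, 1, "PERSON")]) == 1
```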
@@ -41,6 +41,7 @@ cdef class TransitionSystem:
     cdef public attr_t root_label
     cdef public freqs
     cdef public object labels
+    cdef public object cfg
     cdef init_state_t init_beam_state
     cdef del_state_t del_beam_state
@@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
+    def __init__(
+        self,
+        StringStore string_table,
+        labels_by_action=None,
+        min_freq=None,
+        incorrect_spans_key=None
+    ):
+        self.cfg = {"neg_key": incorrect_spans_key}
         self.mem = Pool()
         self.strings = string_table
         self.n_moves = 0

@@ -49,8 +56,13 @@ cdef class TransitionSystem:
         self.del_beam_state = _del_state

     def __reduce__(self):
+        # TODO: This loses the 'cfg'
         return (self.__class__, (self.strings, self.labels), None, None)

+    @property
+    def neg_key(self):
+        return self.cfg.get("neg_key")
+
     def init_batch(self, docs):
         cdef StateClass state
         states = []

@@ -220,16 +232,21 @@ cdef class TransitionSystem:
         transitions = []
         serializers = {
             'moves': lambda: srsly.json_dumps(self.labels),
-            'strings': lambda: self.strings.to_bytes()
+            'strings': lambda: self.strings.to_bytes(),
+            'cfg': lambda: self.cfg
         }
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, bytes_data, exclude=tuple()):
+        # We're adding a new field, 'cfg', here and we don't want to break
+        # previous models that don't have it.
+        msg = srsly.msgpack_loads(bytes_data)
         labels = {}
-        deserializers = {
-            'moves': lambda b: labels.update(srsly.json_loads(b)),
-            'strings': lambda b: self.strings.from_bytes(b)
-        }
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
+        if 'moves' not in exclude:
+            labels.update(srsly.json_loads(msg['moves']))
+        if 'strings' not in exclude:
+            self.strings.from_bytes(msg['strings'])
+        if 'cfg' not in exclude and 'cfg' in msg:
+            self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
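A sketch of the intended serialization behaviour, reusing `vocab`, `actions`, and `moves` from the earlier sketch: the new 'cfg' entry lets the negative-sample key survive `to_bytes()`/`from_bytes()`, while pickling via `__reduce__` still drops it (the TODO above).

```python
# Round-trip through bytes keeps the key:
data = moves.to_bytes()
moves2 = BiluoPushDown(vocab.strings, actions)
moves2.from_bytes(data)
assert moves2.neg_key == "incorrect_spans"
assert moves2.cfg["neg_key"] == "incorrect_spans"
```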
@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.arc_eager cimport ArcEager

@@ -59,7 +60,7 @@ def make_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int

@@ -85,13 +86,13 @@ def make_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     learn_tokens (bool): Whether to learn to merge subtokens that are split
         relative to the gold standard. Experimental.
     min_action_freq (int): The minimum frequency of labelled actions to retain.

@@ -112,6 +113,9 @@ def make_parser(
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )

 @Language.factory(

@@ -140,7 +144,7 @@ def make_beam_parser(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     learn_tokens: bool,
     min_action_freq: int,

@@ -165,8 +169,13 @@ def make_beam_parser(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (List[str]): A list of transition names. Inferred from the data if not
-        provided.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring

@@ -195,7 +204,10 @@ def make_beam_parser(
         beam_update_prob=beam_update_prob,
         multitasks=[],
         learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq
+        min_action_freq=min_action_freq,
+        # At some point in the future we can try to implement support for
+        # partial annotations, perhaps only in the beam objective.
+        incorrect_spans_key=None
     )

@@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
     """
     TransitionSystem = ArcEager

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="parser",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        min_action_freq=30,
+        learn_tokens=False,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create a DependencyParser.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=min_action_freq,
+            learn_tokens=learn_tokens,
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     @property
     def postprocesses(self):
         output = [nonproj.deprojectivize]
@@ -3,6 +3,7 @@ from collections import defaultdict
 from typing import Optional, Iterable
 from thinc.api import Model, Config

+from ._parser_internals.transition_system import TransitionSystem
 from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown

@@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "moves": None,
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},

@@ -48,8 +50,9 @@ def make_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component. The entity recognizer
     identifies non-overlapping labelled spans of tokens.

@@ -67,13 +70,16 @@ def make_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
+    incorrect_spans_key (Optional[str]): Identifies spans that are known
+        to be incorrect entity annotations. The incorrect entity annotations
+        can be stored in the span group, under this key.
     """
     return EntityRecognizer(
         nlp.vocab,

@@ -81,9 +87,8 @@ def make_ner(
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
+        incorrect_spans_key=incorrect_spans_key,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=1,
         beam_density=0.0,
         beam_update_prob=0.0,

@@ -98,7 +103,8 @@ def make_ner(
         "model": DEFAULT_NER_MODEL,
         "beam_density": 0.01,
         "beam_update_prob": 0.5,
-        "beam_width": 32
+        "beam_width": 32,
+        "incorrect_spans_key": None
     },
     default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 )

@@ -106,11 +112,12 @@ def make_beam_ner(
     nlp: Language,
     name: str,
     model: Model,
-    moves: Optional[list],
+    moves: Optional[TransitionSystem],
     update_with_oracle_cut_size: int,
     beam_width: int,
     beam_density: float,
     beam_update_prob: float,
+    incorrect_spans_key: Optional[str]=None
 ):
     """Create a transition-based EntityRecognizer component that uses beam-search.
     The entity recognizer identifies non-overlapping labelled spans of tokens.

@@ -128,13 +135,13 @@ def make_beam_ner(
     model (Model): The model for the transition-based parser. The model needs
         to have a specific substructure of named components --- see the
         spacy.ml.tb_framework.TransitionModel for details.
-    moves (list[str]): A list of transition names. Inferred from the data if not
-        provided.
-    update_with_oracle_cut_size (int):
-        During training, cut long sequences into shorter segments by creating
-        intermediate states based on the gold-standard history. The model is
-        not very sensitive to this parameter, so you usually won't need to change
-        it. 100 is a good default.
+    moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+        updated and evaluated. If 'moves' is None, a new instance is
+        created with `self.TransitionSystem()`. Defaults to `None`.
+    update_with_oracle_cut_size (int): During training, cut long sequences into
+        shorter segments by creating intermediate states based on the gold-standard
+        history. The model is not very sensitive to this parameter, so you usually
+        won't need to change it. 100 is a good default.
     beam_width (int): The number of candidate analyses to maintain.
     beam_density (float): The minimum ratio between the scores of the first and
         last candidates in the beam. This allows the parser to avoid exploring

@@ -144,6 +151,8 @@ def make_beam_ner(
     beam_update_prob (float): The chance of making a beam update, instead of a
         greedy update. Greedy updates are an approximation for the beam updates,
         and are faster to compute.
+    incorrect_spans_key (Optional[str]): Optional key into span groups of
+        entities known to be non-entities.
     """
     return EntityRecognizer(
         nlp.vocab,

@@ -152,11 +161,10 @@ def make_beam_ner(
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
         multitasks=[],
-        min_action_freq=1,
-        learn_tokens=False,
         beam_width=beam_width,
         beam_density=beam_density,
         beam_update_prob=beam_update_prob,
+        incorrect_spans_key=incorrect_spans_key
     )


@@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
     """
     TransitionSystem = BiluoPushDown

+    def __init__(
+        self,
+        vocab,
+        model,
+        name="ner",
+        moves=None,
+        *,
+        update_with_oracle_cut_size=100,
+        beam_width=1,
+        beam_density=0.0,
+        beam_update_prob=0.0,
+        multitasks=tuple(),
+        incorrect_spans_key=None,
+    ):
+        """Create an EntityRecognizer.
+        """
+        super().__init__(
+            vocab,
+            model,
+            name,
+            moves,
+            update_with_oracle_cut_size=update_with_oracle_cut_size,
+            min_action_freq=1,  # not relevant for NER
+            learn_tokens=False,  # not relevant for NER
+            beam_width=beam_width,
+            beam_density=beam_density,
+            beam_update_prob=beam_update_prob,
+            multitasks=multitasks,
+            incorrect_spans_key=incorrect_spans_key,
+        )
+
     def add_multitask_objective(self, mt_component):
         """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)
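At the pipeline level, both NER factories now expose the key through their config. A sketch (the key name and beam settings are illustrative; the beam test below uses a width of 16):

```python
import spacy

# Greedy NER with a negative-sample key:
nlp = spacy.blank("en")
nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})

# Beam NER accepts the same key:
nlp_beam = spacy.blank("en")
nlp_beam.add_pipe(
    "beam_ner",
    config={"beam_width": 16, "incorrect_spans_key": "incorrect_spans"},
)
```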
@@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from .. import util
+

 cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.

@@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe):
         beam_density=0.0,
         beam_update_prob=0.0,
         multitasks=tuple(),
+        incorrect_spans_key=None
     ):
         """Create a Parser.

         vocab (Vocab): The vocabulary object. Must be shared with documents
             to be processed. The value is set to the `.vocab` attribute.
-        **cfg: Configuration parameters. Set to the `.cfg` attribute.
-            If it doesn't include a value for 'moves', a new instance is
-            created with `self.TransitionSystem()`. This defines how the
-            parse-state is created, updated and evaluated.
+        model (Model): The model for the transition-based parser. The model needs
+            to have a specific substructure of named components --- see the
+            spacy.ml.tb_framework.TransitionModel for details.
+        name (str): The name of the pipeline component
+        moves (Optional[TransitionSystem]): This defines how the parse-state is created,
+            updated and evaluated. If 'moves' is None, a new instance is
+            created with `self.TransitionSystem()`. Defaults to `None`.
+        update_with_oracle_cut_size (int): During training, cut long sequences into
+            shorter segments by creating intermediate states based on the gold-standard
+            history. The model is not very sensitive to this parameter, so you usually
+            won't need to change it. 100 is a good default.
+        min_action_freq (int): The minimum frequency of labelled actions to retain.
+            Rarer labelled actions have their label backed-off to "dep". While this
+            primarily affects the label accuracy, it can also affect the attachment
+            structure, as the labels are used to represent the pseudo-projectivity
+            transformation.
+        learn_tokens (bool): Whether to learn to merge subtokens that are split
+            relative to the gold standard. Experimental.
+        beam_width (int): The number of candidate analyses to maintain.
+        beam_density (float): The minimum ratio between the scores of the first and
+            last candidates in the beam. This allows the parser to avoid exploring
+            candidates that are too far behind. This is mostly intended to improve
+            efficiency, but it can also improve accuracy as deeper search is not
+            always better.
+        beam_update_prob (float): The chance of making a beam update, instead of a
+            greedy update. Greedy updates are an approximation for the beam updates,
+            and are faster to compute.
+        multitasks: additional multi-tasking components. Experimental.
+        incorrect_spans_key (Optional[str]): Identifies spans that are known
+            to be incorrect entity annotations. The incorrect entity annotations
+            can be stored in the span group, under this key.
         """
         self.vocab = vocab
         self.name = name

@@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe):
             "learn_tokens": learn_tokens,
             "beam_width": beam_width,
             "beam_density": beam_density,
-            "beam_update_prob": beam_update_prob
+            "beam_update_prob": beam_update_prob,
+            "incorrect_spans_key": incorrect_spans_key
         }
         if moves is None:
-            # defined by EntityRecognizer as a BiluoPushDown
-            moves = self.TransitionSystem(self.vocab.strings)
+            # EntityRecognizer -> BiluoPushDown
+            # DependencyParser -> ArcEager
+            moves = self.TransitionSystem(
+                self.vocab.strings,
+                incorrect_spans_key=incorrect_spans_key
+            )
         self.moves = moves
         self.model = model
         if self.moves.n_moves != 0:

@@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe):
         # Available for subclasses, e.g. to deprojectivize
         return []

+    @property
+    def incorrect_spans_key(self):
+        return self.cfg["incorrect_spans_key"]
+
     def add_label(self, label):
         resized = False
         for action in self.moves.action_types:

@@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe):
         )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
-
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses

@@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe):
                 self._resize()
                 self.model.from_bytes(bytes_data)
             except AttributeError:
-                raise ValueError(Errors.E149) from None
+                raise ValueError(Errors.E149)
         return self

     def to_bytes(self, exclude=tuple()):
@@ -18,14 +18,9 @@ def _ner_example(ner):
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = Doc(en_vocab, words=text)
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(en_vocab, model, **config)
+    ner = EntityRecognizer(en_vocab, model)
     ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)

@@ -40,14 +35,9 @@ def test_ents_reset(en_vocab):
     """Ensure that resetting doc.ents does not change anything"""
     text = ["This", "is", "a", "lion"]
     doc = Doc(en_vocab, words=text)
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(en_vocab, model, **config)
+    ner = EntityRecognizer(en_vocab, model)
     ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
@@ -18,14 +18,9 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(vocab, model, **config)
+    parser = DependencyParser(vocab, model)
     return parser


@@ -77,19 +72,14 @@ def test_add_label(parser):


 def test_add_label_deserializes_correctly():
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner1 = EntityRecognizer(Vocab(), model, **config)
+    ner1 = EntityRecognizer(Vocab(), model)
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
     ner1.initialize(lambda: [_ner_example(ner1)])
-    ner2 = EntityRecognizer(Vocab(), model, **config)
+    ner2 = EntityRecognizer(Vocab(), model)

     # the second model needs to be resized before we can call from_bytes
     ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)

@@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
     """
     labels = ["A", "B", "C"]
     model = registry.resolve({"model": model_config}, validate=True)["model"]
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
-    pipe = pipe_cls(Vocab(), model, **config)
+    pipe = pipe_cls(Vocab(), model)
     for label in labels:
         pipe.add_label(label)
     assert len(pipe.move_names) == len(labels) * n_moves
@@ -130,14 +130,9 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(doc.vocab, model, **config)
+    parser = DependencyParser(doc.vocab, model)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(1, "")
@@ -9,11 +9,12 @@ from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example
 from spacy.tokens import Doc, Span
-from spacy.vocab import Vocab
+from spacy.vocab import Vocab, registry
 import logging

 from ..util import make_tempdir
+from ...pipeline import EntityRecognizer
+from ...pipeline.ner import DEFAULT_NER_MODEL

 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),

@@ -21,6 +22,11 @@ TRAIN_DATA = [
 ]


+@pytest.fixture
+def neg_key():
+    return "non_entities"
+
+
 @pytest.fixture
 def vocab():
     return Vocab()

@@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
     assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]


-@pytest.mark.filterwarnings("ignore::UserWarning")
-def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
-    entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
+def test_negative_samples_two_word_input(tsys, vocab, neg_key):
+    """Test that we don't get stuck in a two word input when we have a negative
+    span. This could happen if we don't have the right check on the B action.
+    """
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A", "B"])
+    entity_annots = [None, None]
     example = Example.from_dict(doc, {"entities": entity_annots})
-    ex_dict = example.to_dict()
-
-    for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]):
-        if tag == "L-!GPE":
-            ex_dict["doc_annotation"]["entities"][i] = "-"
-    example = Example.from_dict(doc, ex_dict)
-
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 2, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[0] != "B-PERSON"
+    assert names[1] != "L-PERSON"


-def test_get_oracle_moves_negative_entities2(tsys, vocab):
-    doc = Doc(vocab, words=["A", "B", "C", "D"])
-    entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
+def test_negative_samples_three_word_input(tsys, vocab, neg_key):
+    """Test that we exclude a 2-word entity correctly using a negative example."""
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A", "B", "C"])
+    entity_annots = [None, None, None]
     example = Example.from_dict(doc, {"entities": entity_annots})
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 2, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[1] != "B-PERSON"


-@pytest.mark.skip(reason="Maybe outdated? Unsure")
-def test_get_oracle_moves_negative_O(tsys, vocab):
-    doc = Doc(vocab, words=["A", "B", "C", "D"])
-    entity_annots = ["O", "!O", "O", "!O"]
+def test_negative_samples_U_entity(tsys, vocab, neg_key):
+    """Test that we exclude a 2-word entity correctly using a negative example."""
+    tsys.cfg["neg_key"] = neg_key
+    doc = Doc(vocab, words=["A"])
+    entity_annots = [None]
     example = Example.from_dict(doc, {"entities": entity_annots})
+    # These mean that the oracle sequence shouldn't have O for the first
+    # word, and it shouldn't analyse it as B-PERSON, L-PERSON
+    example.y.spans[neg_key] = [
+        Span(example.y, 0, 1, label="O"),
+        Span(example.y, 0, 1, label="PERSON"),
+    ]
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
+    assert names[0] != "O"
+    assert names[0] != "U-PERSON"
+
+
+def test_negative_sample_key_is_in_config(vocab, entity_types):
+    actions = BiluoPushDown.get_actions(entity_types=entity_types)
+    tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities")
+    assert tsys.cfg["neg_key"] == "non_entities"


 # We can't easily represent this on a Doc object. Not sure what the best solution
@@ -213,6 +250,27 @@ def test_train_empty():
             nlp.update(batch, losses=losses)


+def test_train_negative_deprecated():
+    """Test that the deprecated negative entity format raises a custom error."""
+    train_data = [
+        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
+    ]
+
+    nlp = English()
+    train_examples = []
+    for t in train_data:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    ner = nlp.add_pipe("ner", last=True)
+    ner.add_label("PERSON")
+    nlp.initialize()
+    for itn in range(2):
+        losses = {}
+        batches = util.minibatch(train_examples, size=8)
+        for batch in batches:
+            with pytest.raises(ValueError):
+                nlp.update(batch, losses=losses)
+
+
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
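As the new test pins down, the old inline notation — prefixing a label with `!` in the `"entities"` data — now raises a `ValueError` during `nlp.update`. A hedged migration sketch (assuming an `nlp` whose NER component was created with `incorrect_spans_key="incorrect_spans"`):

```python
from spacy.tokens import Span
from spacy.training import Example

doc = nlp.make_doc("Who is Shaka Khan?")
# Before (no longer supported, raises ValueError on update):
#   example = Example.from_dict(doc, {"entities": [(7, 17, "!PERSON")]})

# After: leave the token-level annotation missing (None) and record the
# negative annotation in the span group named by incorrect_spans_key.
example = Example.from_dict(doc, {"entities": [None] * len(doc)})
example.reference.spans["incorrect_spans"] = [Span(example.reference, 2, 4, "PERSON")]
```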
@@ -265,6 +323,16 @@ def test_ruler_before_ner():
     assert [token.ent_type_ for token in doc] == expected_types


+def test_ner_constructor(en_vocab):
+    config = {
+        "update_with_oracle_cut_size": 100,
+    }
+    cfg = {"model": DEFAULT_NER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    ner_1 = EntityRecognizer(en_vocab, model, **config)
+    ner_2 = EntityRecognizer(en_vocab, model)
+
+
 def test_ner_before_ruler():
     """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
     nlp = English()
@@ -414,7 +482,7 @@ def test_beam_ner_scores():
     assert 0 - eps <= score <= 1 + eps


-def test_beam_overfitting_IO():
+def test_beam_overfitting_IO(neg_key):
     # Simple test to try and quickly overfit the Beam NER component
     nlp = English()
     beam_width = 16
@@ -422,6 +490,7 @@ def test_beam_overfitting_IO():
     config = {
         "beam_width": beam_width,
         "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
     }
     ner = nlp.add_pipe("beam_ner", config=config)
     train_examples = []
@@ -438,12 +507,13 @@ def test_beam_overfitting_IO():
     assert losses["beam_ner"] < 0.0001

     # test the scores from the beam
-    test_text = "I like London."
+    test_text = "I like London"
     docs = [nlp.make_doc(test_text)]
     beams = ner.predict(docs)
     entity_scores = ner.scored_ents(beams)[0]
     assert entity_scores[(2, 3, "LOC")] == 1.0
     assert entity_scores[(2, 3, "PERSON")] == 0.0
+    assert len(nlp(test_text).ents) == 1

     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
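For context on the assertions in this hunk: `predict` returns one beam per doc, and `scored_ents` converts those beams into one score dict per doc, keyed by `(start, end, label)` token offsets. A small sketch of inspecting the scores outside a test, assuming a trained pipeline `nlp` containing a `"beam_ner"` component:

```python
ner = nlp.get_pipe("beam_ner")
docs = [nlp.make_doc("I like London")]
beams = ner.predict(docs)
# One score dict per input doc; keys are (start, end, label) tuples
for (start, end, label), score in ner.scored_ents(beams)[0].items():
    print(start, end, label, round(score, 3))
```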
@@ -456,6 +526,104 @@ def test_beam_overfitting_IO():
     assert entity_scores2[(2, 3, "LOC")] == 1.0
     assert entity_scores2[(2, 3, "PERSON")] == 0.0

+    # Try to unlearn the entity by using negative annotations
+    neg_doc = nlp.make_doc(test_text)
+    neg_ex = Example(neg_doc, neg_doc)
+    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
+    neg_train_examples = [neg_ex]
+
+    for i in range(20):
+        losses = {}
+        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)
+
+    # test the "untrained" model
+    assert len(nlp(test_text).ents) == 0
+
+
+def test_neg_annotation(neg_key):
+    """Check that the NER update works with a negative annotation that is a different label of the correct one,
+    or partly overlapping, etc"""
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    ner = nlp.add_pipe("beam_ner", config=config)
+    train_text = "Who is Shaka Khan?"
+    neg_doc = nlp.make_doc(train_text)
+    ner.add_label("PERSON")
+    ner.add_label("ORG")
+    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
+    example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")]
+
+    optimizer = nlp.initialize()
+    for i in range(2):
+        losses = {}
+        nlp.update([example], sgd=optimizer, losses=losses)
+
+
+def test_neg_annotation_conflict(neg_key):
+    # Check that NER raises for a negative annotation that is THE SAME as a correct one
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    ner = nlp.add_pipe("beam_ner", config=config)
+    train_text = "Who is Shaka Khan?"
+    neg_doc = nlp.make_doc(train_text)
+    ner.add_label("PERSON")
+    ner.add_label("LOC")
+    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
+    example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]
+    assert len(example.reference.ents) == 1
+    assert example.reference.ents[0].text == "Shaka Khan"
+    assert example.reference.ents[0].label_ == "PERSON"
+    assert len(example.reference.spans[neg_key]) == 1
+    assert example.reference.spans[neg_key][0].text == "Shaka Khan"
+    assert example.reference.spans[neg_key][0].label_ == "PERSON"
+
+    optimizer = nlp.initialize()
+    for i in range(2):
+        losses = {}
+        with pytest.raises(ValueError):
+            nlp.update([example], sgd=optimizer, losses=losses)
+
+
+def test_beam_valid_parse(neg_key):
+    """Regression test for previously flakey behaviour"""
+    nlp = English()
+    beam_width = 16
+    beam_density = 0.0001
+    config = {
+        "beam_width": beam_width,
+        "beam_density": beam_density,
+        "incorrect_spans_key": neg_key,
+    }
+    nlp.add_pipe("beam_ner", config=config)
+    # fmt: off
+    tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.']
+    iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
+    # fmt: on
+
+    doc = Doc(nlp.vocab, words=tokens)
+    example = Example.from_dict(doc, {"ner": iob})
+    neg_span = Span(doc, 50, 53, "ORG")
+    example.reference.spans[neg_key] = [neg_span]
+
+    optimizer = nlp.initialize()
+
+    for i in range(5):
+        losses = {}
+        nlp.update([example], sgd=optimizer, losses=losses)
+    assert "beam_ner" in losses


 def test_ner_warns_no_lookups(caplog):
     nlp = English()
@@ -5,10 +5,11 @@ from spacy.attrs import DEP
 from spacy.lang.en import English
 from spacy.training import Example
 from spacy.tokens import Doc
-from spacy import util
+from spacy import util, registry

 from ..util import apply_transition_sequence, make_tempdir
+from ...pipeline import DependencyParser
+from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL

 TRAIN_DATA = [
     (
@@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab):
             assert token.head in sent


+def test_parser_constructor(en_vocab):
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 30,
+        "update_with_oracle_cut_size": 100,
+    }
+    cfg = {"model": DEFAULT_PARSER_MODEL}
+    model = registry.resolve(cfg, validate=True)["model"]
+    parser_1 = DependencyParser(en_vocab, model, **config)
+    parser_2 = DependencyParser(en_vocab, model)
+
+
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 def test_incomplete_data(pipe_name):
     # Test that the parser works with incomplete information
@@ -23,14 +23,9 @@ def _parser_example(parser):
 @pytest.fixture
 def parser(vocab):
     vocab.strings.add("ROOT")
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = DependencyParser(vocab, model, **config)
+    parser = DependencyParser(vocab, model)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
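This deletion (and the analogous ones below) works because the subclass constructors added in this PR give `learn_tokens`, `min_action_freq` and `update_with_oracle_cut_size` defaults, so `**config` no longer needs to be threaded through every fixture. A minimal sketch of the equivalence, mirroring `test_parser_constructor` above (a sketch under those assumptions, not a verbatim excerpt):

```python
from spacy import registry
from spacy.pipeline import DependencyParser
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.vocab import Vocab

model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
# Spelling the settings out...
parser_explicit = DependencyParser(
    Vocab(), model, learn_tokens=False, min_action_freq=30, update_with_oracle_cut_size=100
)
# ...now matches the defaults, so the short form is equivalent:
parser_default = DependencyParser(Vocab(), model)
```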
@@ -190,14 +190,9 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
-    }
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    ner = EntityRecognizer(doc.vocab, model, **config)
+    ner = EntityRecognizer(doc.vocab, model)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
     ner.add_label("GPE")
@@ -259,8 +259,6 @@ def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
     config = {
         "learn_tokens": False,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
     }
     model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
@@ -274,8 +272,6 @@ def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
     config = {
         "learn_tokens": True,
-        "min_action_freq": 30,
-        "update_with_oracle_cut_size": 100,
     }
     model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
@@ -61,8 +61,6 @@ def taggers(en_vocab):
 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
         "update_with_oracle_cut_size": 100,
         "beam_width": 1,
         "beam_update_prob": 1.0,
@@ -70,8 +68,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = Parser(en_vocab, model, **config)
-    new_parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
+    new_parser = Parser(en_vocab, model)
     new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
     bytes_2 = new_parser.to_bytes(exclude=["vocab"])
     bytes_3 = parser.to_bytes(exclude=["vocab"])
@@ -84,43 +82,27 @@ def test_serialize_parser_strings(Parser):
     vocab1 = Vocab()
     label = "FunnyLabel"
     assert label not in vocab1.strings
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-        "beam_width": 1,
-        "beam_update_prob": 1.0,
-        "beam_density": 0.0,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser1 = Parser(vocab1, model, **config)
+    parser1 = Parser(vocab1, model)
     parser1.add_label(label)
     assert label in parser1.vocab.strings
     vocab2 = Vocab()
     assert label not in vocab2.strings
-    parser2 = Parser(vocab2, model, **config)
+    parser2 = Parser(vocab2, model)
     parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
     assert label in parser2.vocab.strings


 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-        "beam_width": 1,
-        "beam_update_prob": 1.0,
-        "beam_density": 0.0,
-    }
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
     with make_tempdir() as d:
         file_path = d / "parser"
         parser.to_disk(file_path)
-        parser_d = Parser(en_vocab, model, **config)
+        parser_d = Parser(en_vocab, model)
         parser_d = parser_d.from_disk(file_path)
         parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
         parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
@@ -198,17 +180,12 @@ def test_serialize_textcat_empty(en_vocab):
 def test_serialize_pipe_exclude(en_vocab, Parser):
     cfg = {"model": DEFAULT_PARSER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    config = {
-        "learn_tokens": False,
-        "min_action_freq": 0,
-        "update_with_oracle_cut_size": 100,
-    }

     def get_new_parser():
-        new_parser = Parser(en_vocab, model, **config)
+        new_parser = Parser(en_vocab, model)
         return new_parser

-    parser = Parser(en_vocab, model, **config)
+    parser = Parser(en_vocab, model)
     parser.cfg["foo"] = "bar"
     new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
     assert "foo" in new_parser.cfg
@@ -235,9 +235,9 @@ cdef class Example:
                 seen.update(indices)
         return output

-    def get_aligned_ner(self):
+    def get_aligned_ents_and_ner(self):
         if not self.y.has_annotation("ENT_IOB"):
-            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
+            return [], [None] * len(self.x)
         x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False)
         # Default to 'None' for missing values
         x_tags = offsets_to_biluo_tags(
@@ -253,6 +253,10 @@ cdef class Example:
                 x_tags[i] = "O"
             elif self.x[i].is_space:
                 x_tags[i] = "O"
+        return x_ents, x_tags
+
+    def get_aligned_ner(self):
+        x_ents, x_tags = self.get_aligned_ents_and_ner()
         return x_tags

     def to_dict(self):
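The refactor above splits the alignment logic in two: `get_aligned_ents_and_ner` returns the aligned entity `Span` objects together with the BILUO tags, and `get_aligned_ner` becomes a thin wrapper that keeps its old tags-only contract. A sketch, assuming an existing `example`:

```python
# New method: aligned entity spans plus the BILUO tags derived from them
ents, tags = example.get_aligned_ents_and_ner()
# Old entry point still returns just the tags
assert example.get_aligned_ner() == tags
```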
@@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters.

 | Setting                       | Description |
 | ----------------------------- | ----------- |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
 | `min_action_freq`             | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |
@@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and
 | `name`                        | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | `moves`                       | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
 | _keyword-only_                | |
-| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
+| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
-| `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
+| `learn_tokens`                | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
 | `min_action_freq`             | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |

 ## DependencyParser.\_\_call\_\_ {#call tag="method"}
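For reference, these settings are what you would pass through `config` when adding the component; a minimal sketch using the documented values:

```python
import spacy

nlp = spacy.blank("en")
config = {
    "update_with_oracle_cut_size": 100,
    "learn_tokens": False,
    "min_action_freq": 30,
}
nlp.add_pipe("parser", config=config)
```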
@@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters.
 >     "moves": None,
 >     "update_with_oracle_cut_size": 100,
 >     "model": DEFAULT_NER_MODEL,
+>     "incorrect_spans_key": "incorrect_spans",
 > }
 > nlp.add_pipe("ner", config=config)
 > ```
@@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters.
 | `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `incorrect_spans_key`         | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/ner.pyx
@@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name | Description |
 | ----------------------------- | ----------- |
 | `vocab`                       | The shared vocabulary. ~~Vocab~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`                        | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
+| `moves`                       | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ |
 | _keyword-only_                | |
-| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
+| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
+| `incorrect_spans_key`         | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. Defaults to `None`. ~~Optional[str]~~ |

 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
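With the new `incorrect_spans_key` parameter documented above, direct construction looks roughly like this (a sketch; a fresh `Vocab` and the key name `"incorrect_spans"` are arbitrary example choices):

```python
from spacy import registry
from spacy.pipeline import EntityRecognizer
from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.vocab import Vocab

model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
ner = EntityRecognizer(Vocab(), model, incorrect_spans_key="incorrect_spans")
```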
@@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
 > losses = ner.update(examples, sgd=optimizer)
 > ```

 | Name | Description |
 | -------------- | ----------- |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop`         | The dropout rate. ~~float~~ |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |

 ## EntityRecognizer.get_loss {#get_loss tag="method"}