Support negative examples in partial NER annotations (#8106)

* Support a cfg field in transition system

* Make NER 'has gold' check use right alignment for span

* Pass 'negative_samples_key' property into NER transition system

* Add field for negative samples to NER transition system

* Check neg_key in NER has_gold

* Support negative examples in NER oracle

* Test for negative examples in NER

* Fix name of config variable in NER

* Remove vestiges of old-style partial annotation

* Remove obsolete tests

* Add comment noting lack of support for negative samples in parser

* Additions to "neg examples" PR (#8201)

* add custom error and test for deprecated format

* add test for unlearning an entity

* add break also for Begin's cost

* add negative_samples_key property on Parser

* rename

* extend docs & fix some older docs issues

* add subclass constructors, clean up tests, fix docs

* add flaky test with ValueError if gold parse was not found

* remove ValueError if n_gold == 0

* fix docstring

* Hack in environment variables to try out training

* Remove hack

* Remove NER hack, and support 'negative O' samples

* Fix O oracle

* Fix transition parser

* Remove 'not O' from oracle

* Fix NER oracle

* check for spans in both gold.ents and gold.spans and raise if so, to prevent memory access violation

* use set instead of list in consistency check

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Matthew Honnibal 2021-06-17 17:33:00 +10:00 committed by GitHub
parent 02bac8f269
commit 6f5e308d17
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 558 additions and 236 deletions

View File

@ -521,6 +521,13 @@ class Errors:
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E868 = ("Found a conflicting gold annotation in a reference document, "
"with the following char-based span occurring both in the gold ents "
"as well as in the negative spans: {span}.")
E869 = ("The notation '{label}' is not supported anymore. To annotate "
"negative NER samples, use `doc.spans[key]` instead, and "
"specify the key as 'incorrect_spans_key' when constructing "
"the NER component.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "

View File

@ -1,3 +1,5 @@
import os
import random
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool
@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam
from ...tokens.doc cimport Doc
from ...tokens.span import Span
from ...tokens.span cimport Span
from ...typedefs cimport weight_t, attr_t
from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE
from ...structs cimport TokenC
from ...structs cimport TokenC, SpanC
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC
@ -25,7 +28,6 @@ cdef enum:
LAST
UNIT
OUT
ISNT
N_MOVES
@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
MOVE_NAMES[ISNT] = 'x'
cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
cdef class BiluoGold:
cdef Pool mem
cdef GoldNERStateC c
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
self.mem = Pool()
self.c = create_gold_state(self.mem, moves, stcls.c, example)
self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
cdef GoldNERStateC create_gold_state(
Pool mem,
BiluoPushDown moves,
const StateC* stcls,
Example example
Example example,
neg_key
) except *:
cdef GoldNERStateC gs
cdef Span neg
if neg_key is not None:
negs = example.get_aligned_spans_y2x(
example.y.spans.get(neg_key, []),
allow_overlap=True
)
else:
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
ner_tags = example.get_aligned_ner()
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)
# Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
for pos_span in ner_ents:
if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
return gs
@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem):
cdef attr_t label
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
# Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
raise ValueError(Errors.E869.format(label=name))
label = self.strings.add(label_str)
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
if move == ISNT:
return Transition(clas=0, move=ISNT, label=label, score=0)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem):
label_id = label_name
if action == OUT and label_id != 0:
return None
if action == MISSING or action == ISNT:
if action == MISSING:
return None
# Check we're not creating a move we already have, so that this is
# idempotent
@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem):
return parses
def init_gold(self, StateClass state, Example example):
return BiluoGold(self, state, example)
return BiluoGold(self, state, example, self.neg_key)
def has_gold(self, Example eg, start=0, end=None):
# We get x and y referring to X, we want to check relative to Y,
# the reference
y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
if not y_spans:
y_spans = [eg.y[:]]
y_span = y_spans[0]
start = y_span.start
end = y_span.end
neg_key = self.neg_key
if neg_key is not None:
# If we have any negative samples, count that as having annotation.
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem):
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold < 1:
raise ValueError
cdef class Missing:
@ -373,23 +405,33 @@ cdef class Begin:
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
b0 = s.B(0)
cdef int cost = 0
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# B, Gold B --> Label match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
return 1
cost += 1
if s.buffer_length() < 3:
# Handle negatives. In general we can't really do much to block
# B, because we don't know whether the whole entity is going to
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
cost += 1
break
return cost
cdef class In:
@ -462,9 +504,6 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -504,32 +543,41 @@ cdef class Last:
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
move = LAST
b0 = s.B(0)
ent_start = s.E(0)
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
cdef int cost = 0
if g_act == MISSING:
return 0
pass
elif g_act == BEGIN:
# L, Gold B --> True
return 0
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
return not _entity_is_sunk(s, gold.ner)
cost += not _entity_is_sunk(s, gold.ner)
elif g_act == LAST:
# L, Gold L --> True
return 0
pass
elif g_act == OUT:
# L, Gold O --> True
return 0
pass
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
pass
else:
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cost += 1
break
return cost
cdef class Unit:
@ -573,21 +621,29 @@ cdef class Unit:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int cost = 0
if g_act == MISSING:
return 0
pass
elif g_act == UNIT:
# U, Gold U --> True iff tag match
return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
cost += label != g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
return 1
cost += 1
# If we have negative-example entities, integrate them into the objective.
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cost += 1
break
return cost
cdef class Out:
@ -613,25 +669,24 @@ cdef class Out:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
cdef weight_t cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
return 1
cost += 1
elif g_act == IN:
# O, Gold I --> True
return 0
pass
elif g_act == LAST:
# O, Gold L --> True
return 0
pass
elif g_act == OUT:
# O, Gold O --> True
return 0
pass
elif g_act == UNIT:
# O, Gold U --> False
return 1
cost += 1
else:
return 1
cost += 1
return cost

View File

@ -41,6 +41,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef public object labels
cdef public object cfg
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state

View File

@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
def __init__(
self,
StringStore string_table,
labels_by_action=None,
min_freq=None,
incorrect_spans_key=None
):
self.cfg = {"neg_key": incorrect_spans_key}
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -49,8 +56,13 @@ cdef class TransitionSystem:
self.del_beam_state = _del_state
def __reduce__(self):
# TODO: This loses the 'cfg'
return (self.__class__, (self.strings, self.labels), None, None)
@property
def neg_key(self):
return self.cfg.get("neg_key")
def init_batch(self, docs):
cdef StateClass state
states = []
@ -220,16 +232,21 @@ cdef class TransitionSystem:
transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
'strings': lambda: self.strings.to_bytes(),
'cfg': lambda: self.cfg
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple()):
# We're adding a new field, 'cfg', here and we don't want to break
# previous models that don't have it.
msg = srsly.msgpack_loads(bytes_data)
labels = {}
deserializers = {
'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'moves' not in exclude:
labels.update(srsly.json_loads(msg['moves']))
if 'strings' not in exclude:
self.strings.from_bytes(msg['strings'])
if 'cfg' not in exclude and 'cfg' in msg:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager
@ -59,7 +60,7 @@ def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int
@ -85,13 +86,13 @@ def make_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
@ -112,6 +113,9 @@ def make_parser(
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)
@Language.factory(
@ -140,7 +144,7 @@ def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
@ -165,8 +169,13 @@ def make_beam_parser(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
@ -195,7 +204,10 @@ def make_beam_parser(
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None
)
@ -206,6 +218,39 @@ cdef class DependencyParser(Parser):
"""
TransitionSystem = ArcEager
def __init__(
self,
vocab,
model,
name="parser",
moves=None,
*,
update_with_oracle_cut_size=100,
min_action_freq=30,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create a DependencyParser.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=min_action_freq,
learn_tokens=learn_tokens,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)
@property
def postprocesses(self):
output = [nonproj.deprojectivize]

View File

@ -3,6 +3,7 @@ from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@ -48,8 +50,9 @@ def make_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
@ -67,13 +70,16 @@ def make_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
"""
return EntityRecognizer(
nlp.vocab,
@ -81,9 +87,8 @@ def make_ner(
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
incorrect_spans_key=incorrect_spans_key,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
@ -98,7 +103,8 @@ def make_ner(
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32
"beam_width": 32,
"incorrect_spans_key": None
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
@ -106,11 +112,12 @@ def make_beam_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str]=None
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
@ -128,13 +135,13 @@ def make_beam_ner(
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
@ -144,6 +151,8 @@ def make_beam_ner(
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key of the span group that
holds spans known to be incorrect entity annotations.
"""
return EntityRecognizer(
nlp.vocab,
@ -152,11 +161,10 @@ def make_beam_ner(
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
min_action_freq=1,
learn_tokens=False,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key
)
@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser):
"""
TransitionSystem = BiluoPushDown
def __init__(
self,
vocab,
model,
name="ner",
moves=None,
*,
update_with_oracle_cut_size=100,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None,
):
"""Create an EntityRecognizer.
"""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=1, # not relevant for NER
learn_tokens=False, # not relevant for NER
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key,
)
def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)

View File

@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples
from ..errors import Errors, Warnings
from .. import util
cdef class Parser(TrainablePipe):
"""
Base class of the DependencyParser and EntityRecognizer.
@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe):
beam_density=0.0,
beam_update_prob=0.0,
multitasks=tuple(),
incorrect_spans_key=None
):
"""Create a Parser.
vocab (Vocab): The vocabulary object. Must be shared with documents
to be processed. The value is set to the `.vocab` attribute.
**cfg: Configuration parameters. Set to the `.cfg` attribute.
If it doesn't include a value for 'moves', a new instance is
created with `self.TransitionSystem()`. This defines how the
parse-state is created, updated and evaluated.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
name (str): The name of the pipeline component
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
multitasks: additional multi-tasking components. Experimental.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
"""
self.vocab = vocab
self.name = name
@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe):
"learn_tokens": learn_tokens,
"beam_width": beam_width,
"beam_density": beam_density,
"beam_update_prob": beam_update_prob
"beam_update_prob": beam_update_prob,
"incorrect_spans_key": incorrect_spans_key
}
if moves is None:
# defined by EntityRecognizer as a BiluoPushDown
moves = self.TransitionSystem(self.vocab.strings)
# EntityRecognizer -> BiluoPushDown
# DependencyParser -> ArcEager
moves = self.TransitionSystem(
self.vocab.strings,
incorrect_spans_key=incorrect_spans_key
)
self.moves = moves
self.model = model
if self.moves.n_moves != 0:
@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe):
# Available for subclasses, e.g. to deprojectivize
return []
@property
def incorrect_spans_key(self):
return self.cfg["incorrect_spans_key"]
def add_label(self, label):
resized = False
for action in self.moves.action_types:
@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe):
)
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
if n_examples == 0:
return losses
@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe):
self._resize()
self.model.from_bytes(bytes_data)
except AttributeError:
raise ValueError(Errors.E149) from None
raise ValueError(Errors.E149)
return self
def to_bytes(self, exclude=tuple()):

View File

@ -18,14 +18,9 @@ def _ner_example(ner):
def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"]
doc = Doc(en_vocab, words=text)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner = EntityRecognizer(en_vocab, model)
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
@ -40,14 +35,9 @@ def test_ents_reset(en_vocab):
"""Ensure that resetting doc.ents does not change anything"""
text = ["This", "is", "a", "lion"]
doc = Doc(en_vocab, words=text)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner = EntityRecognizer(en_vocab, model)
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]

View File

@ -18,14 +18,9 @@ def vocab():
@pytest.fixture
def parser(vocab):
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(vocab, model, **config)
parser = DependencyParser(vocab, model)
return parser
@ -77,19 +72,14 @@ def test_add_label(parser):
def test_add_label_deserializes_correctly():
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner1 = EntityRecognizer(Vocab(), model, **config)
ner1 = EntityRecognizer(Vocab(), model)
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)
ner2 = EntityRecognizer(Vocab(), model)
# the second model needs to be resized before we can call from_bytes
ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
"""
labels = ["A", "B", "C"]
model = registry.resolve({"model": model_config}, validate=True)["model"]
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
pipe = pipe_cls(Vocab(), model, **config)
pipe = pipe_cls(Vocab(), model)
for label in labels:
pipe.add_label(label)
assert len(pipe.move_names) == len(labels) * n_moves

View File

@ -130,14 +130,9 @@ def test_get_oracle_actions():
deps.append(dep)
ents.append(ent)
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(doc.vocab, model, **config)
parser = DependencyParser(doc.vocab, model)
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")
parser.moves.add_action(1, "")

View File

@ -9,11 +9,12 @@ from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.vocab import Vocab, registry
import logging
from ..util import make_tempdir
from ...pipeline import EntityRecognizer
from ...pipeline.ner import DEFAULT_NER_MODEL
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
@ -21,6 +22,11 @@ TRAIN_DATA = [
]
@pytest.fixture
def neg_key():
return "non_entities"
@pytest.fixture
def vocab():
return Vocab()
@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
def test_negative_samples_two_word_input(tsys, vocab, neg_key):
"""Test that we don't get stuck in a two word input when we have a negative
span. This could happen if we don't have the right check on the B action.
"""
tsys.cfg["neg_key"] = neg_key
doc = Doc(vocab, words=["A", "B"])
entity_annots = [None, None]
example = Example.from_dict(doc, {"entities": entity_annots})
ex_dict = example.to_dict()
for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]):
if tag == "L-!GPE":
ex_dict["doc_annotation"]["entities"][i] = "-"
example = Example.from_dict(doc, ex_dict)
# These mean that the oracle sequence shouldn't have O for the first
# word, and it shouldn't analyse it as B-PERSON, L-PERSON
example.y.spans[neg_key] = [
Span(example.y, 0, 1, label="O"),
Span(example.y, 0, 2, label="PERSON"),
]
act_classes = tsys.get_oracle_sequence(example)
names = [tsys.get_class_name(act) for act in act_classes]
assert names
assert names[0] != "O"
assert names[0] != "B-PERSON"
assert names[1] != "L-PERSON"
def test_get_oracle_moves_negative_entities2(tsys, vocab):
doc = Doc(vocab, words=["A", "B", "C", "D"])
entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
def test_negative_samples_three_word_input(tsys, vocab, neg_key):
    """Check the oracle on a three-token doc when negative spans forbid "O"
    on the first token and a PERSON entity over the first two tokens.
    """
    tsys.cfg["neg_key"] = neg_key
    words = ["A", "B", "C"]
    doc = Doc(vocab, words=words)
    # No positive entity annotation: every token is left unannotated.
    example = Example.from_dict(doc, {"entities": [None] * len(words)})
    # Negative annotations: token 0 may not be "O", and tokens 0-1 may not be
    # analysed as a PERSON entity.
    forbidden = [
        Span(example.y, 0, 1, label="O"),
        Span(example.y, 0, 2, label="PERSON"),
    ]
    example.y.spans[neg_key] = forbidden
    oracle_names = [
        tsys.get_class_name(action) for action in tsys.get_oracle_sequence(example)
    ]
    assert oracle_names
    assert oracle_names[0] != "O"
    assert oracle_names[1] != "B-PERSON"
@pytest.mark.skip(reason="Maybe outdated? Unsure")
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=["A", "B", "C", "D"])
entity_annots = ["O", "!O", "O", "!O"]
def test_negative_samples_U_entity(tsys, vocab, neg_key):
    """Test that we exclude a single-token (U) entity correctly using a
    negative example.
    """
    tsys.cfg["neg_key"] = neg_key
    doc = Doc(vocab, words=["A"])
    # No positive entity annotation for the only token.
    entity_annots = [None]
    example = Example.from_dict(doc, {"entities": entity_annots})
    # These mean that the oracle sequence shouldn't have O for the only
    # word, and it shouldn't analyse it as U-PERSON either.
    example.y.spans[neg_key] = [
        Span(example.y, 0, 1, label="O"),
        Span(example.y, 0, 1, label="PERSON"),
    ]
    act_classes = tsys.get_oracle_sequence(example)
    names = [tsys.get_class_name(act) for act in act_classes]
    assert names
    assert names[0] != "O"
    assert names[0] != "U-PERSON"
def test_negative_sample_key_is_in_config(vocab, entity_types):
    """The `incorrect_spans_key` constructor argument must end up in
    the transition system's config under "neg_key".
    """
    moves = BiluoPushDown(
        vocab.strings,
        BiluoPushDown.get_actions(entity_types=entity_types),
        incorrect_spans_key="non_entities",
    )
    stored_key = moves.cfg["neg_key"]
    assert stored_key == "non_entities"
# We can't easily represent this on a Doc object. Not sure what the best solution
@ -213,6 +250,27 @@ def test_train_empty():
nlp.update(batch, losses=losses)
def test_train_negative_deprecated():
    """Updating on the deprecated "!LABEL" negative entity format must raise
    a ValueError.
    """
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
    ]
    nlp = English()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in train_data
    ]
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    # Every update on every pass is expected to fail, not just the first one.
    for _ in range(2):
        losses = {}
        for batch in util.minibatch(train_examples, size=8):
            with pytest.raises(ValueError):
                nlp.update(batch, losses=losses)
def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")
@ -265,6 +323,16 @@ def test_ruler_before_ner():
assert [token.ent_type_ for token in doc] == expected_types
def test_ner_constructor(en_vocab):
    """Smoke test: EntityRecognizer can be constructed both with explicit
    keyword settings and with no extra settings at all.
    """
    model = registry.resolve({"model": DEFAULT_NER_MODEL}, validate=True)["model"]
    settings = {
        "update_with_oracle_cut_size": 100,
    }
    # The instances are not inspected; the test passes if neither call raises.
    with_settings = EntityRecognizer(en_vocab, model, **settings)
    without_settings = EntityRecognizer(en_vocab, model)
def test_ner_before_ruler():
""" Test that an entity_ruler works after an NER: the second can overwrite O annotations """
nlp = English()
@ -414,7 +482,7 @@ def test_beam_ner_scores():
assert 0 - eps <= score <= 1 + eps
def test_beam_overfitting_IO():
def test_beam_overfitting_IO(neg_key):
# Simple test to try and quickly overfit the Beam NER component
nlp = English()
beam_width = 16
@ -422,6 +490,7 @@ def test_beam_overfitting_IO():
config = {
"beam_width": beam_width,
"beam_density": beam_density,
"incorrect_spans_key": neg_key,
}
ner = nlp.add_pipe("beam_ner", config=config)
train_examples = []
@ -438,12 +507,13 @@ def test_beam_overfitting_IO():
assert losses["beam_ner"] < 0.0001
# test the scores from the beam
test_text = "I like London."
test_text = "I like London"
docs = [nlp.make_doc(test_text)]
beams = ner.predict(docs)
entity_scores = ner.scored_ents(beams)[0]
assert entity_scores[(2, 3, "LOC")] == 1.0
assert entity_scores[(2, 3, "PERSON")] == 0.0
assert len(nlp(test_text).ents) == 1
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
@ -456,6 +526,104 @@ def test_beam_overfitting_IO():
assert entity_scores2[(2, 3, "LOC")] == 1.0
assert entity_scores2[(2, 3, "PERSON")] == 0.0
# Try to unlearn the entity by using negative annotations
neg_doc = nlp.make_doc(test_text)
neg_ex = Example(neg_doc, neg_doc)
neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
neg_train_examples = [neg_ex]
for i in range(20):
losses = {}
nlp.update(neg_train_examples, sgd=optimizer, losses=losses)
# test the "untrained" model
assert len(nlp(test_text).ents) == 0
def test_neg_annotation(neg_key):
    """Check that a beam NER update runs when the negative spans differ from
    the gold entity: same tokens with another label, a sub-span, and a
    partly overlapping longer span.
    """
    nlp = English()
    config = {
        "beam_width": 16,
        "beam_density": 0.0001,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    neg_doc = nlp.make_doc("Who is Shaka Khan?")
    for label in ("PERSON", "ORG"):
        ner.add_label(label)
    # Gold entity: characters 7-17 labelled PERSON.
    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
    negative_spans = [
        Span(neg_doc, 2, 4, "ORG"),
        Span(neg_doc, 2, 3, "PERSON"),
        Span(neg_doc, 1, 4, "PERSON"),
    ]
    example.reference.spans[neg_key] = negative_spans
    optimizer = nlp.initialize()
    for _ in range(2):
        losses = {}
        nlp.update([example], sgd=optimizer, losses=losses)
def test_neg_annotation_conflict(neg_key):
    """Check that the beam NER update raises a ValueError when a negative
    span is identical (same tokens, same label) to a gold entity.
    """
    nlp = English()
    config = {
        "beam_width": 16,
        "beam_density": 0.0001,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    neg_doc = nlp.make_doc("Who is Shaka Khan?")
    for label in ("PERSON", "LOC"):
        ner.add_label(label)
    example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
    example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]

    # Sanity checks: the gold entity and the negative span really coincide.
    assert len(example.reference.ents) == 1
    gold_ent = example.reference.ents[0]
    assert gold_ent.text == "Shaka Khan"
    assert gold_ent.label_ == "PERSON"
    assert len(example.reference.spans[neg_key]) == 1
    negative_span = example.reference.spans[neg_key][0]
    assert negative_span.text == "Shaka Khan"
    assert negative_span.label_ == "PERSON"

    optimizer = nlp.initialize()
    for _ in range(2):
        losses = {}
        with pytest.raises(ValueError):
            nlp.update([example], sgd=optimizer, losses=losses)
def test_beam_valid_parse(neg_key):
    """Regression test for previously flaky behaviour: repeated beam NER
    updates on a fully annotated doc that also carries a negative span.
    """
    nlp = English()
    config = {
        "beam_width": 16,
        "beam_density": 0.0001,
        "incorrect_spans_key": neg_key,
    }
    nlp.add_pipe("beam_ner", config=config)
    # fmt: off
    tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.']
    iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
    # fmt: on
    doc = Doc(nlp.vocab, words=tokens)
    example = Example.from_dict(doc, {"ner": iob})
    # Negative annotation over the last three tokens (indices 50-52), which
    # the gold tags above mark as "O".
    example.reference.spans[neg_key] = [Span(doc, 50, 53, "ORG")]
    optimizer = nlp.initialize()
    for _ in range(5):
        losses = {}
        nlp.update([example], sgd=optimizer, losses=losses)
    assert "beam_ner" in losses
def test_ner_warns_no_lookups(caplog):
nlp = English()

View File

@ -5,10 +5,11 @@ from spacy.attrs import DEP
from spacy.lang.en import English
from spacy.training import Example
from spacy.tokens import Doc
from spacy import util
from spacy import util, registry
from ..util import apply_transition_sequence, make_tempdir
from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
TRAIN_DATA = [
(
@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab):
assert token.head in sent
def test_parser_constructor(en_vocab):
    """Smoke test: DependencyParser can be constructed both with explicit
    keyword settings and with no extra settings at all.
    """
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    settings = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    # The instances are not inspected; the test passes if neither call raises.
    with_settings = DependencyParser(en_vocab, model, **settings)
    without_settings = DependencyParser(en_vocab, model)
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
def test_incomplete_data(pipe_name):
# Test that the parser works with incomplete information

View File

@ -23,14 +23,9 @@ def _parser_example(parser):
@pytest.fixture
def parser(vocab):
vocab.strings.add("ROOT")
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(vocab, model, **config)
parser = DependencyParser(vocab, model)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32
# parser.add_label('right')

View File

@ -190,14 +190,9 @@ def test_issue3345():
doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
doc[4].is_sent_start = True
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(doc.vocab, model, **config)
ner = EntityRecognizer(doc.vocab, model)
# Add the OUT action. I wouldn't have thought this would be necessary...
ner.moves.add_action(5, "")
ner.add_label("GPE")

View File

@ -259,8 +259,6 @@ def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
parser = DependencyParser(Vocab(), model, **config)
@ -274,8 +272,6 @@ def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
parser = DependencyParser(Vocab(), model, **config)

View File

@ -61,8 +61,6 @@ def taggers(en_vocab):
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,
@ -70,8 +68,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = Parser(en_vocab, model, **config)
new_parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
new_parser = Parser(en_vocab, model)
new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
bytes_2 = new_parser.to_bytes(exclude=["vocab"])
bytes_3 = parser.to_bytes(exclude=["vocab"])
@ -84,43 +82,27 @@ def test_serialize_parser_strings(Parser):
vocab1 = Vocab()
label = "FunnyLabel"
assert label not in vocab1.strings
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,
"beam_density": 0.0,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser1 = Parser(vocab1, model, **config)
parser1 = Parser(vocab1, model)
parser1.add_label(label)
assert label in parser1.vocab.strings
vocab2 = Vocab()
assert label not in vocab2.strings
parser2 = Parser(vocab2, model, **config)
parser2 = Parser(vocab2, model)
parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
assert label in parser2.vocab.strings
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
"beam_width": 1,
"beam_update_prob": 1.0,
"beam_density": 0.0,
}
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
with make_tempdir() as d:
file_path = d / "parser"
parser.to_disk(file_path)
parser_d = Parser(en_vocab, model, **config)
parser_d = Parser(en_vocab, model)
parser_d = parser_d.from_disk(file_path)
parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
@ -198,17 +180,12 @@ def test_serialize_textcat_empty(en_vocab):
def test_serialize_pipe_exclude(en_vocab, Parser):
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
config = {
"learn_tokens": False,
"min_action_freq": 0,
"update_with_oracle_cut_size": 100,
}
def get_new_parser():
new_parser = Parser(en_vocab, model, **config)
new_parser = Parser(en_vocab, model)
return new_parser
parser = Parser(en_vocab, model, **config)
parser = Parser(en_vocab, model)
parser.cfg["foo"] = "bar"
new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
assert "foo" in new_parser.cfg

View File

@ -235,9 +235,9 @@ cdef class Example:
seen.update(indices)
return output
def get_aligned_ner(self):
def get_aligned_ents_and_ner(self):
if not self.y.has_annotation("ENT_IOB"):
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
return [], [None] * len(self.x)
x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False)
# Default to 'None' for missing values
x_tags = offsets_to_biluo_tags(
@ -253,6 +253,10 @@ cdef class Example:
x_tags[i] = "O"
elif self.x[i].is_space:
x_tags[i] = "O"
return x_ents, x_tags
def get_aligned_ner(self):
x_ents, x_tags = self.get_aligned_ents_and_ner()
return x_tags
def to_dict(self):

View File

@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |
@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters.
> "moves": None,
> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
> "incorrect_spans_key": "incorrect_spans",
> }
> nlp.add_pipe("ner", config=config)
> ```
@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters.
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER will learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/ner.pyx
@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
| Name | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ |
| _keyword-only_ | |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. Defaults to `None`. ~~Optional[str]~~ |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
> losses = ner.update(examples, sgd=optimizer)
> ```
| Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityRecognizer.get_loss {#get_loss tag="method"}