diff --git a/spacy/errors.py b/spacy/errors.py index 71539800e..b5d11c1f7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -521,6 +521,13 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E868 = ("Found a conflicting gold annotation in a reference document, " + "with the following char-based span occurring both in the gold ents " + "as well as in the negative spans: {span}.") + E869 = ("The notation '{label}' is not supported anymore. To annotate " + "negative NER samples, use `doc.spans[key]` instead, and " + "specify the key as 'incorrect_spans_key' when constructing " + "the NER component.") E870 = ("Could not serialize the DocBin because it is too large. Consider " "splitting up your documents into several doc bins and serializing " "each separately. spacy.Corpus.v1 will search recursively for all " diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index 4b0d07725..3edeff19a 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,3 +1,5 @@ +import os +import random from libc.stdint cimport int32_t from cymem.cymem cimport Pool @@ -6,10 +8,11 @@ from thinc.extra.search cimport Beam from ...tokens.doc cimport Doc from ...tokens.span import Span +from ...tokens.span cimport Span from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE -from ...structs cimport TokenC +from ...structs cimport TokenC, SpanC from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC @@ -25,7 +28,6 @@ cdef enum: LAST UNIT OUT - ISNT N_MOVES @@ -36,39 +38,62 @@ MOVE_NAMES[IN] = 'I' MOVE_NAMES[LAST] = 'L' MOVE_NAMES[UNIT] = 'U' MOVE_NAMES[OUT] = 'O' -MOVE_NAMES[ISNT] = 'x' cdef struct GoldNERStateC: Transition* ner + SpanC* negs int32_t length + int32_t nr_neg cdef class BiluoGold: cdef Pool mem cdef GoldNERStateC c - def __init__(self, BiluoPushDown moves, StateClass stcls, Example example): + def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key): self.mem = Pool() - self.c = create_gold_state(self.mem, moves, stcls.c, example) + self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key) def update(self, StateClass stcls): update_gold_state(&self.c, stcls.c) - cdef GoldNERStateC create_gold_state( Pool mem, BiluoPushDown moves, const StateC* stcls, - Example example + Example example, + neg_key ) except *: cdef GoldNERStateC gs + cdef Span neg + if neg_key is not None: + negs = example.get_aligned_spans_y2x( + example.y.spans.get(neg_key, []), + allow_overlap=True + ) + else: + negs = [] assert example.x.length > 0 gs.ner = mem.alloc(example.x.length, sizeof(Transition)) - ner_tags = example.get_aligned_ner() + gs.negs = mem.alloc(len(negs), sizeof(SpanC)) + gs.nr_neg = len(negs) + ner_ents, ner_tags = example.get_aligned_ents_and_ner() for i, ner_tag in enumerate(ner_tags): gs.ner[i] = moves.lookup_transition(ner_tag) + + # Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label. 
+ neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs} + for pos_span in ner_ents: + if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples: + raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_))) + + # In order to handle negative samples, we need to maintain the full + # (start, end, label) triple. If we break it down to the 'isnt B-LOC' + # thing, we'll get blocked if there's an incorrect prefix. + for i, neg in enumerate(negs): + gs.negs[i] = neg.c return gs @@ -156,21 +181,16 @@ cdef class BiluoPushDown(TransitionSystem): cdef attr_t label if name == '-' or name == '' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) - elif name == '!O': - return Transition(clas=0, move=ISNT, label=0, score=0) elif '-' in name: move_str, label_str = name.split('-', 1) - # Hacky way to denote 'not this entity' + # Deprecated, hacky way to denote 'not this entity' if label_str.startswith('!'): - label_str = label_str[1:] - move_str = 'x' + raise ValueError(Errors.E869.format(label=name)) label = self.strings.add(label_str) else: move_str = name label = 0 move = MOVE_NAMES.index(move_str) - if move == ISNT: - return Transition(clas=0, move=ISNT, label=label, score=0) for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] @@ -220,7 +240,7 @@ cdef class BiluoPushDown(TransitionSystem): label_id = label_name if action == OUT and label_id != 0: return None - if action == MISSING or action == ISNT: + if action == MISSING: return None # Check we're not creating a move we already have, so that this is # idempotent @@ -270,9 +290,23 @@ cdef class BiluoPushDown(TransitionSystem): return parses def init_gold(self, StateClass state, Example example): - return BiluoGold(self, state, example) + return BiluoGold(self, state, example, self.neg_key) def has_gold(self, Example eg, start=0, end=None): + # We get x and y referring to X, we want to check relative to Y, + # the reference + y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]]) + if not y_spans: + y_spans = [eg.y[:]] + y_span = y_spans[0] + start = y_span.start + end = y_span.end + neg_key = self.neg_key + if neg_key is not None: + # If we have any negative samples, count that as having annotation. + for span in eg.y.spans.get(neg_key, []): + if span.start >= start and span.end <= end: + return True for word in eg.y[start:end]: if word.ent_iob != 0: return True @@ -306,8 +340,6 @@ cdef class BiluoPushDown(TransitionSystem): n_gold += costs[i] <= 0 else: costs[i] = 9000 - if n_gold < 1: - raise ValueError cdef class Missing: @@ -373,23 +405,33 @@ cdef class Begin: @staticmethod cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + b0 = s.B(0) + cdef int cost = 0 + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # B, Gold B --> Label match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # B, Gold I --> False (P) # B, Gold L --> False (P) # B, Gold O --> False (P) # B, Gold U --> False (P) - return 1 + cost += 1 + if s.buffer_length() < 3: + # Handle negatives. 
In general we can't really do much to block + # B, because we don't know whether the whole entity is going to + # be correct or not. However, we can at least tell whether we're + # going to be opening an entity where there's only one possible + # L. + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0: + cost += 1 + break + return cost cdef class In: @@ -462,9 +504,6 @@ cdef class In: elif g_act == UNIT: # I, Gold U --> True iff next tag == O return next_act != OUT - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 else: return 1 @@ -504,32 +543,41 @@ cdef class Last: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil: gold = _gold move = LAST + b0 = s.B(0) + ent_start = s.E(0) - cdef int g_act = gold.ner[s.B(0)].move - cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int g_act = gold.ner[b0].move + cdef attr_t g_tag = gold.ner[b0].label + + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == BEGIN: # L, Gold B --> True - return 0 + pass elif g_act == IN: # L, Gold I --> True iff this entity sunk - return not _entity_is_sunk(s, gold.ner) + cost += not _entity_is_sunk(s, gold.ner) elif g_act == LAST: # L, Gold L --> True - return 0 + pass elif g_act == OUT: # L, Gold O --> True - return 0 + pass elif g_act == UNIT: # L, Gold U --> True - return 0 - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return 0 + pass else: - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective, + # by marking actions that close an entity that we know is incorrect + # as costly. + for span in gold.negs[:gold.nr_neg]: + if span.label == label and (span.end-1) == b0 and span.start == ent_start: + cost += 1 + break + return cost cdef class Unit: @@ -573,21 +621,29 @@ cdef class Unit: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label + cdef int cost = 0 if g_act == MISSING: - return 0 + pass elif g_act == UNIT: # U, Gold U --> True iff tag match - return label != g_tag - # Support partial supervision in the form of "not this label" - elif g_act == ISNT: - return label == g_tag + cost += label != g_tag else: # U, Gold B --> False # U, Gold I --> False # U, Gold L --> False # U, Gold O --> False - return 1 + cost += 1 + # If we have negative-example entities, integrate them into the objective. 
+ # This is fairly straight-forward for U- entities, as we have a single + # action + cdef int b0 = s.B(0) + for span in gold.negs[:gold.nr_neg]: + if span.label == label and span.start == b0 and span.end == (b0+1): + cost += 1 + break + return cost + cdef class Out: @@ -613,25 +669,24 @@ cdef class Out: gold = _gold cdef int g_act = gold.ner[s.B(0)].move cdef attr_t g_tag = gold.ner[s.B(0)].label - - if g_act == ISNT and g_tag == 0: - return 1 - elif g_act == MISSING or g_act == ISNT: - return 0 + cdef weight_t cost = 0 + if g_act == MISSING: + pass elif g_act == BEGIN: # O, Gold B --> False - return 1 + cost += 1 elif g_act == IN: # O, Gold I --> True - return 0 + pass elif g_act == LAST: # O, Gold L --> True - return 0 + pass elif g_act == OUT: # O, Gold O --> True - return 0 + pass elif g_act == UNIT: # O, Gold U --> False - return 1 + cost += 1 else: - return 1 + cost += 1 + return cost diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index eed347b98..52ebd2b8e 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -41,6 +41,7 @@ cdef class TransitionSystem: cdef public attr_t root_label cdef public freqs cdef public object labels + cdef public object cfg cdef init_state_t init_beam_state cdef del_state_t del_beam_state diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 9e6f847eb..18eb745a9 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -33,7 +33,14 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1: cdef class TransitionSystem: - def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None): + def __init__( + self, + StringStore string_table, + labels_by_action=None, + min_freq=None, + incorrect_spans_key=None + ): + self.cfg = {"neg_key": incorrect_spans_key} self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -49,8 +56,13 @@ cdef class TransitionSystem: self.del_beam_state = _del_state def __reduce__(self): + # TODO: This loses the 'cfg' return (self.__class__, (self.strings, self.labels), None, None) + @property + def neg_key(self): + return self.cfg.get("neg_key") + def init_batch(self, docs): cdef StateClass state states = [] @@ -220,16 +232,21 @@ cdef class TransitionSystem: transitions = [] serializers = { 'moves': lambda: srsly.json_dumps(self.labels), - 'strings': lambda: self.strings.to_bytes() + 'strings': lambda: self.strings.to_bytes(), + 'cfg': lambda: self.cfg } return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple()): + # We're adding a new field, 'cfg', here and we don't want to break + # previous models that don't have it. 
+ msg = srsly.msgpack_loads(bytes_data) labels = {} - deserializers = { - 'moves': lambda b: labels.update(srsly.json_loads(b)), - 'strings': lambda b: self.strings.from_bytes(b) - } - msg = util.from_bytes(bytes_data, deserializers, exclude) + if 'moves' not in exclude: + labels.update(srsly.json_loads(msg['moves'])) + if 'strings' not in exclude: + self.strings.from_bytes(msg['strings']) + if 'cfg' not in exclude and 'cfg' in msg: + self.cfg.update(msg['cfg']) self.initialize_actions(labels) return self diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 37f09ce3a..be23ab0dd 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser cimport Parser from ._parser_internals.arc_eager cimport ArcEager @@ -59,7 +60,7 @@ def make_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int @@ -85,13 +86,13 @@ def make_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. @@ -112,6 +113,9 @@ def make_parser( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. + incorrect_spans_key=None ) @Language.factory( @@ -140,7 +144,7 @@ def make_beam_parser( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, @@ -165,8 +169,13 @@ def make_beam_parser( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (List[str]): A list of transition names. Inferred from the data if not - provided. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. 
+ update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -195,7 +204,10 @@ def make_beam_parser( beam_update_prob=beam_update_prob, multitasks=[], learn_tokens=learn_tokens, - min_action_freq=min_action_freq + min_action_freq=min_action_freq, + # At some point in the future we can try to implement support for + # partial annotations, perhaps only in the beam objective. + incorrect_spans_key=None ) @@ -206,6 +218,39 @@ cdef class DependencyParser(Parser): """ TransitionSystem = ArcEager + def __init__( + self, + vocab, + model, + name="parser", + moves=None, + *, + update_with_oracle_cut_size=100, + min_action_freq=30, + learn_tokens=False, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create a DependencyParser. + """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=min_action_freq, + learn_tokens=learn_tokens, + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + @property def postprocesses(self): output = [nonproj.deprojectivize] diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 0b9b0d324..f4ae4b787 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config +from ._parser_internals.transition_system import TransitionSystem from .transition_parser cimport Parser from ._parser_internals.ner cimport BiluoPushDown @@ -40,6 +41,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, + "incorrect_spans_key": None }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, @@ -48,8 +50,9 @@ def make_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -67,13 +70,16 @@ def make_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. 
+ update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. """ return EntityRecognizer( nlp.vocab, @@ -81,9 +87,8 @@ def make_ner( name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, + incorrect_spans_key=incorrect_spans_key, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=1, beam_density=0.0, beam_update_prob=0.0, @@ -98,7 +103,8 @@ def make_ner( "model": DEFAULT_NER_MODEL, "beam_density": 0.01, "beam_update_prob": 0.5, - "beam_width": 32 + "beam_width": 32, + "incorrect_spans_key": None }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) @@ -106,11 +112,12 @@ def make_beam_ner( nlp: Language, name: str, model: Model, - moves: Optional[list], + moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, beam_width: int, beam_density: float, beam_update_prob: float, + incorrect_spans_key: Optional[str]=None ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -128,13 +135,13 @@ def make_beam_ner( model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. - moves (list[str]): A list of transition names. Inferred from the data if not - provided. - update_with_oracle_cut_size (int): - During training, cut long sequences into shorter segments by creating - intermediate states based on the gold-standard history. The model is - not very sensitive to this parameter, so you usually won't need to change - it. 100 is a good default. + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring @@ -144,6 +151,8 @@ def make_beam_ner( beam_update_prob (float): The chance of making a beam update, instead of a greedy update. Greedy updates are an approximation for the beam updates, and are faster to compute. + incorrect_spans_key (Optional[str]): Optional key into span groups of + entities known to be non-entities. 
""" return EntityRecognizer( nlp.vocab, @@ -152,11 +161,10 @@ def make_beam_ner( moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, multitasks=[], - min_action_freq=1, - learn_tokens=False, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, + incorrect_spans_key=incorrect_spans_key ) @@ -167,6 +175,37 @@ cdef class EntityRecognizer(Parser): """ TransitionSystem = BiluoPushDown + def __init__( + self, + vocab, + model, + name="ner", + moves=None, + *, + update_with_oracle_cut_size=100, + beam_width=1, + beam_density=0.0, + beam_update_prob=0.0, + multitasks=tuple(), + incorrect_spans_key=None, + ): + """Create an EntityRecognizer. + """ + super().__init__( + vocab, + model, + name, + moves, + update_with_oracle_cut_size=update_with_oracle_cut_size, + min_action_freq=1, # not relevant for NER + learn_tokens=False, # not relevant for NER + beam_width=beam_width, + beam_density=beam_density, + beam_update_prob=beam_update_prob, + multitasks=multitasks, + incorrect_spans_key=incorrect_spans_key, + ) + def add_multitask_objective(self, mt_component): """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 4de57d311..a495b1bc7 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -29,6 +29,7 @@ from ..training import validate_examples, validate_get_examples from ..errors import Errors, Warnings from .. import util + cdef class Parser(TrainablePipe): """ Base class of the DependencyParser and EntityRecognizer. @@ -48,15 +49,43 @@ cdef class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), + incorrect_spans_key=None ): """Create a Parser. vocab (Vocab): The vocabulary object. Must be shared with documents to be processed. The value is set to the `.vocab` attribute. - **cfg: Configuration parameters. Set to the `.cfg` attribute. - If it doesn't include a value for 'moves', a new instance is - created with `self.TransitionSystem()`. This defines how the - parse-state is created, updated and evaluated. + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + name (str): The name of the pipeline component + moves (Optional[TransitionSystem]): This defines how the parse-state is created, + updated and evaluated. If 'moves' is None, a new instance is + created with `self.TransitionSystem()`. Defaults to `None`. + update_with_oracle_cut_size (int): During training, cut long sequences into + shorter segments by creating intermediate states based on the gold-standard + history. The model is not very sensitive to this parameter, so you usually + won't need to change it. 100 is a good default. + min_action_freq (int): The minimum frequency of labelled actions to retain. + Rarer labelled actions have their label backed-off to "dep". While this + primarily affects the label accuracy, it can also affect the attachment + structure, as the labels are used to represent the pseudo-projectivity + transformation. + learn_tokens (bool): Whether to learn to merge subtokens that are split + relative to the gold standard. Experimental. + beam_width (int): The number of candidate analyses to maintain. + beam_density (float): The minimum ratio between the scores of the first and + last candidates in the beam. 
This allows the parser to avoid exploring + candidates that are too far behind. This is mostly intended to improve + efficiency, but it can also improve accuracy as deeper search is not + always better. + beam_update_prob (float): The chance of making a beam update, instead of a + greedy update. Greedy updates are an approximation for the beam updates, + and are faster to compute. + multitasks: additional multi-tasking components. Experimental. + incorrect_spans_key (Optional[str]): Identifies spans that are known + to be incorrect entity annotations. The incorrect entity annotations + can be stored in the span group, under this key. """ self.vocab = vocab self.name = name @@ -68,11 +97,16 @@ cdef class Parser(TrainablePipe): "learn_tokens": learn_tokens, "beam_width": beam_width, "beam_density": beam_density, - "beam_update_prob": beam_update_prob + "beam_update_prob": beam_update_prob, + "incorrect_spans_key": incorrect_spans_key } if moves is None: - # defined by EntityRecognizer as a BiluoPushDown - moves = self.TransitionSystem(self.vocab.strings) + # EntityRecognizer -> BiluoPushDown + # DependencyParser -> ArcEager + moves = self.TransitionSystem( + self.vocab.strings, + incorrect_spans_key=incorrect_spans_key + ) self.moves = moves self.model = model if self.moves.n_moves != 0: @@ -118,6 +152,10 @@ cdef class Parser(TrainablePipe): # Available for subclasses, e.g. to deprojectivize return [] + @property + def incorrect_spans_key(self): + return self.cfg["incorrect_spans_key"] + def add_label(self, label): resized = False for action in self.moves.action_types: @@ -326,7 +364,6 @@ cdef class Parser(TrainablePipe): ) for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) - n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) if n_examples == 0: return losses @@ -554,7 +591,7 @@ cdef class Parser(TrainablePipe): self._resize() self.model.from_bytes(bytes_data) except AttributeError: - raise ValueError(Errors.E149) from None + raise ValueError(Errors.E149) return self def to_bytes(self, exclude=tuple()): diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index fa0206fdd..231b7c2a8 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -18,14 +18,9 @@ def _ner_example(ner): def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) @@ -40,14 +35,9 @@ def test_ents_reset(en_vocab): """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] doc = Doc(en_vocab, words=text) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(en_vocab, model, **config) + ner = EntityRecognizer(en_vocab, model) ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index e955a12a8..f89e993e9 100644 --- a/spacy/tests/parser/test_add_label.py +++ 
b/spacy/tests/parser/test_add_label.py @@ -18,14 +18,9 @@ def vocab(): @pytest.fixture def parser(vocab): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) return parser @@ -77,19 +72,14 @@ def test_add_label(parser): def test_add_label_deserializes_correctly(): - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner1 = EntityRecognizer(Vocab(), model, **config) + ner1 = EntityRecognizer(Vocab(), model) ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") ner1.initialize(lambda: [_ner_example(ner1)]) - ner2 = EntityRecognizer(Vocab(), model, **config) + ner2 = EntityRecognizer(Vocab(), model) # the second model needs to be resized before we can call from_bytes ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves) @@ -113,12 +103,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config): """ labels = ["A", "B", "C"] model = registry.resolve({"model": model_config}, validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } - pipe = pipe_cls(Vocab(), model, **config) + pipe = pipe_cls(Vocab(), model) for label in labels: pipe.add_label(label) assert len(pipe.move_names) == len(labels) * n_moves diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 66c22c60b..cba6fa81e 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -130,14 +130,9 @@ def test_get_oracle_actions(): deps.append(dep) ents.append(ent) doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(doc.vocab, model, **config) + parser = DependencyParser(doc.vocab, model) parser.moves.add_action(0, "") parser.moves.add_action(1, "") parser.moves.add_action(1, "") diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d7c37fbd1..eccfbf174 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -9,11 +9,12 @@ from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example from spacy.tokens import Doc, Span -from spacy.vocab import Vocab +from spacy.vocab import Vocab, registry import logging from ..util import make_tempdir - +from ...pipeline import EntityRecognizer +from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), @@ -21,6 +22,11 @@ TRAIN_DATA = [ ] +@pytest.fixture +def neg_key(): + return "non_entities" + + @pytest.fixture def vocab(): return Vocab() @@ -59,39 +65,70 @@ def test_get_oracle_moves(tsys, doc, entity_annots): assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"] -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): - entity_annots = [(s, e, "!" 
+ label) for s, e, label in entity_annots] +def test_negative_samples_two_word_input(tsys, vocab, neg_key): + """Test that we don't get stuck in a two word input when we have a negative + span. This could happen if we don't have the right check on the B action. + """ + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B"]) + entity_annots = [None, None] example = Example.from_dict(doc, {"entities": entity_annots}) - ex_dict = example.to_dict() - - for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): - if tag == "L-!GPE": - ex_dict["doc_annotation"]["entities"][i] = "-" - example = Example.from_dict(doc, ex_dict) - + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "B-PERSON" + assert names[1] != "L-PERSON" -def test_get_oracle_moves_negative_entities2(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] +def test_negative_samples_three_word_input(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A", "B", "C"]) + entity_annots = [None, None, None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 2, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[1] != "B-PERSON" -@pytest.mark.skip(reason="Maybe outdated? Unsure") -def test_get_oracle_moves_negative_O(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["O", "!O", "O", "!O"] +def test_negative_samples_U_entity(tsys, vocab, neg_key): + """Test that we exclude a 2-word entity correctly using a negative example.""" + tsys.cfg["neg_key"] = neg_key + doc = Doc(vocab, words=["A"]) + entity_annots = [None] example = Example.from_dict(doc, {"entities": entity_annots}) + # These mean that the oracle sequence shouldn't have O for the first + # word, and it shouldn't analyse it as B-PERSON, L-PERSON + example.y.spans[neg_key] = [ + Span(example.y, 0, 1, label="O"), + Span(example.y, 0, 1, label="PERSON"), + ] act_classes = tsys.get_oracle_sequence(example) names = [tsys.get_class_name(act) for act in act_classes] assert names + assert names[0] != "O" + assert names[0] != "U-PERSON" + + +def test_negative_sample_key_is_in_config(vocab, entity_types): + actions = BiluoPushDown.get_actions(entity_types=entity_types) + tsys = BiluoPushDown(vocab.strings, actions, incorrect_spans_key="non_entities") + assert tsys.cfg["neg_key"] == "non_entities" # We can't easily represent this on a Doc object. 
Not sure what the best solution @@ -213,6 +250,27 @@ def test_train_empty(): nlp.update(batch, losses=losses) +def test_train_negative_deprecated(): + """Test that the deprecated negative entity format raises a custom error.""" + train_data = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}), + ] + + nlp = English() + train_examples = [] + for t in train_data: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + ner = nlp.add_pipe("ner", last=True) + ner.add_label("PERSON") + nlp.initialize() + for itn in range(2): + losses = {} + batches = util.minibatch(train_examples, size=8) + for batch in batches: + with pytest.raises(ValueError): + nlp.update(batch, losses=losses) + + def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") @@ -265,6 +323,16 @@ def test_ruler_before_ner(): assert [token.ent_type_ for token in doc] == expected_types +def test_ner_constructor(en_vocab): + config = { + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + ner_1 = EntityRecognizer(en_vocab, model, **config) + ner_2 = EntityRecognizer(en_vocab, model) + + def test_ner_before_ruler(): """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """ nlp = English() @@ -414,7 +482,7 @@ def test_beam_ner_scores(): assert 0 - eps <= score <= 1 + eps -def test_beam_overfitting_IO(): +def test_beam_overfitting_IO(neg_key): # Simple test to try and quickly overfit the Beam NER component nlp = English() beam_width = 16 @@ -422,6 +490,7 @@ def test_beam_overfitting_IO(): config = { "beam_width": beam_width, "beam_density": beam_density, + "incorrect_spans_key": neg_key, } ner = nlp.add_pipe("beam_ner", config=config) train_examples = [] @@ -438,12 +507,13 @@ def test_beam_overfitting_IO(): assert losses["beam_ner"] < 0.0001 # test the scores from the beam - test_text = "I like London." + test_text = "I like London" docs = [nlp.make_doc(test_text)] beams = ner.predict(docs) entity_scores = ner.scored_ents(beams)[0] assert entity_scores[(2, 3, "LOC")] == 1.0 assert entity_scores[(2, 3, "PERSON")] == 0.0 + assert len(nlp(test_text).ents) == 1 # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -456,6 +526,104 @@ def test_beam_overfitting_IO(): assert entity_scores2[(2, 3, "LOC")] == 1.0 assert entity_scores2[(2, 3, "PERSON")] == 0.0 + # Try to unlearn the entity by using negative annotations + neg_doc = nlp.make_doc(test_text) + neg_ex = Example(neg_doc, neg_doc) + neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")] + neg_train_examples = [neg_ex] + + for i in range(20): + losses = {} + nlp.update(neg_train_examples, sgd=optimizer, losses=losses) + + # test the "untrained" model + assert len(nlp(test_text).ents) == 0 + + +def test_neg_annotation(neg_key): + """Check that the NER update works with a negative annotation that is a different label of the correct one, + or partly overlapping, etc""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" 
+ neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("ORG") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "ORG"), Span(neg_doc, 2, 3, "PERSON"), Span(neg_doc, 1, 4, "PERSON")] + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_neg_annotation_conflict(neg_key): + # Check that NER raises for a negative annotation that is THE SAME as a correct one + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + ner = nlp.add_pipe("beam_ner", config=config) + train_text = "Who is Shaka Khan?" + neg_doc = nlp.make_doc(train_text) + ner.add_label("PERSON") + ner.add_label("LOC") + example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]}) + example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")] + assert len(example.reference.ents) == 1 + assert example.reference.ents[0].text == "Shaka Khan" + assert example.reference.ents[0].label_ == "PERSON" + assert len(example.reference.spans[neg_key]) == 1 + assert example.reference.spans[neg_key][0].text == "Shaka Khan" + assert example.reference.spans[neg_key][0].label_ == "PERSON" + + optimizer = nlp.initialize() + for i in range(2): + losses = {} + with pytest.raises(ValueError): + nlp.update([example], sgd=optimizer, losses=losses) + + +def test_beam_valid_parse(neg_key): + """Regression test for previously flakey behaviour""" + nlp = English() + beam_width = 16 + beam_density = 0.0001 + config = { + "beam_width": beam_width, + "beam_density": beam_density, + "incorrect_spans_key": neg_key, + } + nlp.add_pipe("beam_ner", config=config) + # fmt: off + tokens = ['FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):', 'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for', 'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')', '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';', '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.', 'Source', ':', 'Telerate', 'Systems', 'Inc.'] + iob = ['B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] + # fmt: on + + doc = Doc(nlp.vocab, words=tokens) + example = Example.from_dict(doc, {"ner": iob}) + neg_span = Span(doc, 50, 53, "ORG") + example.reference.spans[neg_key] = [neg_span] + + optimizer = nlp.initialize() + + for i in range(5): + losses = {} + nlp.update([example], sgd=optimizer, losses=losses) + assert "beam_ner" in losses + def test_ner_warns_no_lookups(caplog): nlp = English() diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index dc878dd7a..1b0d9d256 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -5,10 +5,11 @@ from spacy.attrs import DEP from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc -from spacy import util +from spacy import util, registry from ..util import apply_transition_sequence, make_tempdir - +from ...pipeline 
import DependencyParser +from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL TRAIN_DATA = [ ( @@ -215,6 +216,18 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent +def test_parser_constructor(en_vocab): + config = { + "learn_tokens": False, + "min_action_freq": 30, + "update_with_oracle_cut_size": 100, + } + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser_1 = DependencyParser(en_vocab, model, **config) + parser_2 = DependencyParser(en_vocab, model) + + @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) def test_incomplete_data(pipe_name): # Test that the parser works with incomplete information diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 595bfa537..d71388900 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -23,14 +23,9 @@ def _parser_example(parser): @pytest.fixture def parser(vocab): vocab.strings.add("ROOT") - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model, **config) + parser = DependencyParser(vocab, model) parser.cfg["token_vector_width"] = 4 parser.cfg["hidden_width"] = 32 # parser.add_label('right') diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 362ba67ae..e123d2df9 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -190,14 +190,9 @@ def test_issue3345(): doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - config = { - "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, - } cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model, **config) + ner = EntityRecognizer(doc.vocab, model) # Add the OUT action. I wouldn't have thought this would be necessary... 
ner.moves.add_action(5, "") ner.add_label("GPE") diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 0505571c2..9d3a27435 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -259,8 +259,6 @@ def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { "learn_tokens": False, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) @@ -274,8 +272,6 @@ def test_issue3830_with_subtok(): """Test that the parser does have subtok label if learn_tokens=True.""" config = { "learn_tokens": True, - "min_action_freq": 30, - "update_with_oracle_cut_size": 100, } model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] parser = DependencyParser(Vocab(), model, **config) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 48c7082bb..35cc22d24 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -61,8 +61,6 @@ def taggers(en_vocab): @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): config = { - "learn_tokens": False, - "min_action_freq": 0, "update_with_oracle_cut_size": 100, "beam_width": 1, "beam_update_prob": 1.0, @@ -70,8 +68,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) - new_parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) + new_parser = Parser(en_vocab, model) new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"])) bytes_2 = new_parser.to_bytes(exclude=["vocab"]) bytes_3 = parser.to_bytes(exclude=["vocab"]) @@ -84,43 +82,27 @@ def test_serialize_parser_strings(Parser): vocab1 = Vocab() label = "FunnyLabel" assert label not in vocab1.strings - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser1 = Parser(vocab1, model, **config) + parser1 = Parser(vocab1, model) parser1.add_label(label) assert label in parser1.vocab.strings vocab2 = Vocab() assert label not in vocab2.strings - parser2 = Parser(vocab2, model, **config) + parser2 = Parser(vocab2, model) parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"])) assert label in parser2.vocab.strings @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, Parser): - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - "beam_width": 1, - "beam_update_prob": 1.0, - "beam_density": 0.0, - } cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) with make_tempdir() as d: file_path = d / "parser" parser.to_disk(file_path) - parser_d = Parser(en_vocab, model, **config) + parser_d = Parser(en_vocab, model) parser_d = parser_d.from_disk(file_path) parser_bytes = parser.to_bytes(exclude=["model", "vocab"]) 
parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"]) @@ -198,17 +180,12 @@ def test_serialize_textcat_empty(en_vocab): def test_serialize_pipe_exclude(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} model = registry.resolve(cfg, validate=True)["model"] - config = { - "learn_tokens": False, - "min_action_freq": 0, - "update_with_oracle_cut_size": 100, - } def get_new_parser(): - new_parser = Parser(en_vocab, model, **config) + new_parser = Parser(en_vocab, model) return new_parser - parser = Parser(en_vocab, model, **config) + parser = Parser(en_vocab, model) parser.cfg["foo"] = "bar" new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"])) assert "foo" in new_parser.cfg diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 07a83bfec..a1f3f98b3 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -235,9 +235,9 @@ cdef class Example: seen.update(indices) return output - def get_aligned_ner(self): + def get_aligned_ents_and_ner(self): if not self.y.has_annotation("ENT_IOB"): - return [None] * len(self.x) # should this be 'missing' instead of 'None' ? + return [], [None] * len(self.x) x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False) # Default to 'None' for missing values x_tags = offsets_to_biluo_tags( @@ -253,6 +253,10 @@ cdef class Example: x_tags[i] = "O" elif self.x[i].is_space: x_tags[i] = "O" + return x_ents, x_tags + + def get_aligned_ner(self): + x_ents, x_tags = self.get_aligned_ents_and_ner() return x_tags def to_dict(self): diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index c4e2e1697..f2d67b95c 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[TransitionSystem]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ | @@ -88,8 +88,8 @@ shortcut for this and instantiate the component using its string name and | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `moves` | A list of transition names. Inferred from the data if not provided. 
~~Optional[List[str]]~~ | | _keyword-only_ | | -| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ | -| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 348736209..b237729be 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -37,6 +37,7 @@ architectures and their arguments and hyperparameters. > "moves": None, > "update_with_oracle_cut_size": 100, > "model": DEFAULT_NER_MODEL, +> "incorrect_spans_key": "incorrect_spans", > } > nlp.add_pipe("ner", config=config) > ``` @@ -46,6 +47,7 @@ architectures and their arguments and hyperparameters. | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | +| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/ner.pyx @@ -72,14 +74,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `moves` | A list of transition names. 
Inferred from the data if not provided. ~~Optional[List[str]]~~ | -| _keyword-only_ | | -| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ | +| Name | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `moves` | A list of transition names. Inferred from the data if set to `None`, which is the default. ~~Optional[List[str]]~~ | +| _keyword-only_ | | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `incorrect_spans_key` | Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. Defaults to `None`. ~~Optional[str]~~ | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} @@ -220,14 +223,14 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and > losses = ner.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityRecognizer.get_loss {#get_loss tag="method"}
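
For reference, a minimal usage sketch of the new `incorrect_spans_key` option, assembled from the tests added above. The span-group key name `"incorrect_spans"` and the example sentence are arbitrary illustrative choices, not part of the API.

```python
# Illustrative sketch only: the span-group key name ("incorrect_spans") is an
# arbitrary choice; any key passed as `incorrect_spans_key` works the same way.
import spacy
from spacy.tokens import Span
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", config={"incorrect_spans_key": "incorrect_spans"})
ner.add_label("PERSON")

doc = nlp.make_doc("Who is Shaka Khan?")
example = Example.from_dict(doc, {"entities": [(7, 17, "PERSON")]})
# Mark the single token "Khan" as a span the model should learn *not* to
# predict as PERSON. This does not conflict with the gold "Shaka Khan" span,
# because the character offsets differ.
example.reference.spans["incorrect_spans"] = [
    Span(example.reference, 3, 4, label="PERSON")
]

optimizer = nlp.initialize()
losses = {}
nlp.update([example], sgd=optimizer, losses=losses)
```

With the key configured, spans stored under it in the reference doc are treated as known-incorrect entity annotations during training. The deprecated `!LABEL` entity notation is no longer accepted and now raises `E869`; negative samples should instead be provided through `doc.spans` as above.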