From 05803cfe7649adefddd2dd95eb069ab9495dd173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Fri, 8 Dec 2023 14:38:05 +0100 Subject: [PATCH] Revert "Reimplement distillation with oracle cut size (#12214)" This reverts commit e27c60a70263f7ab17968964de37e938653e37a2. --- spacy/ml/tb_framework.pyx | 4 +- spacy/pipeline/transition_parser.pyx | 105 ++++++--------------------- spacy/tests/parser/test_model.py | 61 ---------------- spacy/tests/parser/test_ner.py | 5 +- spacy/tests/parser/test_parse.py | 5 +- 5 files changed, 24 insertions(+), 156 deletions(-) delete mode 100644 spacy/tests/parser/test_model.py diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx index 6c5c29d85..e497643f0 100644 --- a/spacy/ml/tb_framework.pyx +++ b/spacy/ml/tb_framework.pyx @@ -267,11 +267,9 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, cdef np.ndarray step_actions scores = [] - while sizes.states >= 1 and (actions is None or len(actions) > 0): + while sizes.states >= 1: step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_actions = actions[0] if actions is not None else None - assert step_actions is None or step_actions.size == sizes.states, \ - f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" with nogil: _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes) if actions is None: diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 2496d1376..1231e439e 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -43,10 +43,6 @@ from ..training import ( from ._parser_internals import _beam_utils -# TODO: Remove when we switch to Cython 3. -cdef extern from "" namespace "std" nogil: - bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except + - NUMPY_OPS = NumpyOps() @@ -265,8 +261,8 @@ class Parser(TrainablePipe): # batch uniform length. Since we do not have a gold standard # sequence, we use the teacher's predictions as the gold # standard. - max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2)) - states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves) + max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) + states = self._init_batch(teacher_pipe, student_docs, max_moves) else: states = self.moves.init_batch(student_docs) @@ -277,14 +273,12 @@ class Parser(TrainablePipe): # gradients of the student's transition distributions relative to the # teacher's distributions. - student_inputs = TransitionModelInputs(docs=student_docs, - states=[state.copy() for state in states], - moves=self.moves, - max_moves=max_moves) + student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, + max_moves=max_moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = _states_diff_to_actions(states, student_states) + actions = states2actions(student_states) teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], - states=states, moves=teacher_pipe.moves, actions=actions) + moves=self.moves, actions=actions) (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) @@ -532,7 +526,7 @@ class Parser(TrainablePipe): set_dropout_rate(self.model, 0.0) student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = _states_to_actions(student_states) + actions = states2actions(student_states) teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) _, teacher_scores = self._rehearsal_model.predict(teacher_inputs) @@ -652,7 +646,7 @@ class Parser(TrainablePipe): raise ValueError(Errors.E149) from None return self - def _init_batch_from_teacher(self, teacher_pipe, docs, max_length): + def _init_batch(self, teacher_step_model, docs, max_length): """Make a square batch of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -661,12 +655,10 @@ class Parser(TrainablePipe): _init_gold_batch, this version uses a teacher model to generate the cut sequences.""" cdef: + StateClass start_state StateClass state - TransitionSystem moves = teacher_pipe.moves - - # Start with the same heuristic as in supervised training: exclude - # docs that are within the maximum length. - all_states = moves.init_batch(docs) + Transition action + all_states = self.moves.init_batch(docs) states = [] to_cut = [] for state, doc in zip(all_states, docs): @@ -675,30 +667,19 @@ class Parser(TrainablePipe): states.append(state) else: to_cut.append(state) - - if not to_cut: - return states - - # Parse the states that are too long with the teacher's parsing model. - teacher_inputs = TransitionModelInputs(docs=docs, - moves=moves, - states=[state.copy() for state in to_cut]) - (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs) - - # Step through the teacher's actions and store every state after - # each multiple of max_length. - teacher_actions = _states_to_actions(teacher_states) while to_cut: states.extend(state.copy() for state in to_cut) - for step_actions in teacher_actions[:max_length]: - to_cut = moves.apply_actions(to_cut, step_actions) - teacher_actions = teacher_actions[max_length:] - - if len(teacher_actions) < max_length: - break - + # Move states forward max_length actions. + length = 0 + while to_cut and length < max_length: + teacher_scores = teacher_step_model.predict(to_cut) + self.transition_states(to_cut, teacher_scores) + # States that are completed do not need further cutting. + to_cut = [state for state in to_cut if not state.is_final()] + length += 1 return states + def _init_gold_batch(self, examples, max_length): """Make a square batch, of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we @@ -759,7 +740,7 @@ def _change_attrs(model, **kwargs): model.attrs[key] = value -def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: +def states2actions(states: List[StateClass]) -> List[Ints1d]: cdef int step cdef StateClass state cdef StateC* c_state @@ -780,47 +761,3 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: actions.append(numpy.array(step_actions, dtype="i")) return actions - - -def _states_diff_to_actions( - before_states: List[StateClass], - after_states: List[StateClass] -) -> List[Ints1d]: - """ - Return for two sets of states the actions to go from the first set of - states to the second set of states. The histories of the first set of - states must be a prefix of the second set of states. - """ - cdef StateClass before_state, after_state - cdef StateC* c_state_before - cdef StateC* c_state_after - - assert len(before_states) == len(after_states) - - # Check invariant: before states histories must be prefixes of after states. - for before_state, after_state in zip(before_states, after_states): - c_state_before = before_state.c - c_state_after = after_state.c - - assert equal(c_state_before.history.begin(), - c_state_before.history.end(), - c_state_after.history.begin()) - - actions = [] - while True: - step = len(actions) - - step_actions = [] - for before_state, after_state in zip(before_states, after_states): - c_state_before = before_state.c - c_state_after = after_state.c - if step < c_state_after.history.size() - c_state_before.history.size(): - step_actions.append(c_state_after.history[c_state_before.history.size() + step]) - - # We are done if we have exhausted all histories. - if len(step_actions) == 0: - break - - actions.append(numpy.array(step_actions, dtype="i")) - - return actions diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py deleted file mode 100644 index 8c1cf7a93..000000000 --- a/spacy/tests/parser/test_model.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy -import pytest - -from spacy.lang.en import English -from spacy.ml.tb_framework import TransitionModelInputs -from spacy.training import Example - -TRAIN_DATA = [ - ( - "They trade mortgage-backed securities.", - { - "heads": [1, 1, 4, 4, 5, 1, 1], - "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], - }, - ), - ( - "I like London and Berlin.", - { - "heads": [1, 1, 1, 2, 2, 1], - "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], - }, - ), -] - - -@pytest.fixture -def nlp_parser(): - nlp = English() - parser = nlp.add_pipe("parser") - - train_examples = [] - for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for dep in annotations["deps"]: - parser.add_label(dep) - nlp.initialize() - - return nlp, parser - - -def test_incorrect_number_of_actions(nlp_parser): - nlp, parser = nlp_parser - doc = nlp.make_doc("test") - - # Too many actions for the number of docs - with pytest.raises(AssertionError): - parser.model.predict( - TransitionModelInputs( - docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")] - ) - ) - - # Too few actions for the number of docs - with pytest.raises(AssertionError): - parser.model.predict( - TransitionModelInputs( - docs=[doc, doc], - moves=parser.moves, - actions=[numpy.array([0], dtype="i")], - ) - ) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index f0efc3a63..bb9b7653c 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -623,9 +623,7 @@ def test_is_distillable(): assert ner.is_distillable -@pytest.mark.slow -@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) -def test_distill(max_moves): +def test_distill(): teacher = English() teacher_ner = teacher.add_pipe("ner") train_examples = [] @@ -643,7 +641,6 @@ def test_distill(max_moves): student = English() student_ner = student.add_pipe("ner") - student_ner.cfg["update_with_oracle_cut_size"] = max_moves student_ner.initialize( get_examples=lambda: train_examples, labels=teacher_ner.label_data ) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 4837d65f5..2f6c77ba8 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -462,9 +462,7 @@ def test_is_distillable(): assert parser.is_distillable -@pytest.mark.slow -@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) -def test_distill(max_moves): +def test_distill(): teacher = English() teacher_parser = teacher.add_pipe("parser") train_examples = [] @@ -482,7 +480,6 @@ def test_distill(max_moves): student = English() student_parser = student.add_pipe("parser") - student_parser.cfg["update_with_oracle_cut_size"] = max_moves student_parser.initialize( get_examples=lambda: train_examples, labels=teacher_parser.label_data )