From d5b1673790c8b5ab7a29455081bbb4612c83a8d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 24 Jan 2021 23:54:36 +1100 Subject: [PATCH 01/74] Try to fix doc.copy --- spacy/tokens/doc.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 32f8c91fa..872a41356 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -261,11 +261,11 @@ cdef class Doc: cdef const LexemeC* lexeme for word, has_space in zip(words, spaces): if isinstance(word, unicode): - lexeme = self.vocab.get(self.mem, word) + lexeme = self.vocab.get(self.vocab.mem, word) elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) else: - lexeme = self.vocab.get_by_orth(self.mem, word) + lexeme = self.vocab.get_by_orth(self.vocab.mem, word) self.push_back(lexeme, has_space) if heads is not None: @@ -1185,6 +1185,7 @@ cdef class Doc: other.user_hooks = dict(self.user_hooks) other.user_token_hooks = dict(self.user_token_hooks) other.user_span_hooks = dict(self.user_span_hooks) + other.spans = self.spans.copy() other.length = self.length other.max_length = self.max_length buff_size = other.max_length + (PADDING*2) @@ -1334,7 +1335,7 @@ cdef class Doc: end = start + attrs[i, 0] has_space = attrs[i, 1] orth_ = text[start:end] - lex = self.vocab.get(self.mem, orth_) + lex = self.vocab.get(self.vocab.mem, orth_) self.push_back(lex, has_space) start = end + has_space self.from_array(msg["array_head"][2:], attrs[:, 2:]) From 4048ca01ebb4e4018f8b28b4eb5a7abfc0577857 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 00:08:49 +1100 Subject: [PATCH 02/74] Set dev version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 35e27db7b..b5a080ed1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4" +__version__ = "3.0.0rc4.dev10" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 6117adcd6d2eaa7988ab3e9bfc0789881f1afe16 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 00:23:02 +1100 Subject: [PATCH 03/74] Make vocab always own lexemes --- spacy/vocab.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8359d8452..e8ed1b61c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -161,8 +161,11 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: - if len(string) < 3 or self.length < 10000: - mem = self.mem + #if len(string) < 3 or self.length < 10000: + # mem = self.mem + # TODO: Experiment with never allowing the Doc to own lexemes, to see + # if it solves the Doc.copy() issue. 
+ mem = self.mem cdef bint is_oov = mem is not self.mem lex = mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) From 8a22161b59067ad56e89b314156285275f82017d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 00:23:43 +1100 Subject: [PATCH 04/74] Change version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b5a080ed1..f0601271e 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev10" +__version__ = "3.0.0rc4.dev11" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 492c94893781151c7b25d73a95cfc93cd5c9b014 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 00:51:38 +1100 Subject: [PATCH 05/74] Add SpanGroups.copy method --- spacy/tokens/_dict_proxies.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index b10f6d484..bfc867ffa 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -33,6 +33,9 @@ class SpanGroups(UserDict): def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup: return SpanGroup(self.doc_ref(), name=name, spans=spans) + def copy(self) -> "SpanGroups": + return SpanGroup(self.doc_ref()).from_bytes(self.to_bytes()) + def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. From 827fb51e6ccd469f683cfb66bb8c71a74bdaeefb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 00:52:00 +1100 Subject: [PATCH 06/74] Fix set_annotations during Parser.update --- spacy/pipeline/transition_parser.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 8cb4ea15d..15b07e9b1 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -370,7 +370,11 @@ cdef class Parser(TrainablePipe): if sgd not in (None, False): self.finish_update(sgd) docs = [eg.predicted for eg in examples] - self.set_annotations(docs, all_states) + # TODO: Refactor so we don't have to parse twice like this (ugh) + # The issue is that we cut up the gold batch into sub-states, and that + # makes it hard to get the actual predicted transition sequence. + predicted_states = self.predict(docs) + self.set_annotations(docs, predicted_states) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. 
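The SpanGroups.copy method added in PATCH 05 builds its clone with the SpanGroup constructor (a single named group) rather than SpanGroups (the dict-like container that Doc.spans holds), so Doc.copy, which since PATCH 01 calls self.spans.copy(), would hand back an object of the wrong type; the next patch swaps in the correct constructor. A minimal reproduction sketch, assuming a build with these patches applied; the example text and the "orgs" key are illustrative only, not taken from the patches:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China.")
    # Attach a custom span group, the structure Doc.spans is meant to hold.
    doc.spans["orgs"] = [doc[3:6]]

    copied = doc.copy()  # internally does other.spans = self.spans.copy()
    # With the PATCH 05 version, spans.copy() constructs a single SpanGroup,
    # so the copied doc loses the mapping interface; after the fix in the
    # next patch it stays a SpanGroups mapping and dict-style access works.
    print(type(copied.spans).__name__)                    # SpanGroups
    print([span.text for span in copied.spans["orgs"]])   # ['Bank of China']
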
From 351ce600c5e08fd1bfd35c24c30034c6b47cff45 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 01:21:47 +1100 Subject: [PATCH 07/74] Fix dict proxy copy --- spacy/tokens/_dict_proxies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index bfc867ffa..7b2d2d5b5 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -34,7 +34,7 @@ class SpanGroups(UserDict): return SpanGroup(self.doc_ref(), name=name, spans=spans) def copy(self) -> "SpanGroups": - return SpanGroup(self.doc_ref()).from_bytes(self.to_bytes()) + return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes()) def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups From 8f07e6c9012941324254ce8773ca91d4038a21c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 01:22:06 +1100 Subject: [PATCH 08/74] Upd version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index f0601271e..b0b398547 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev11" +__version__ = "3.0.0rc4.dev12" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bb15d5b22fac5ca48d1836b778bec18c7ec9b24d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 09:50:29 +1100 Subject: [PATCH 09/74] Fix copying SpanGroups --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 872a41356..66ad722b7 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1180,12 +1180,12 @@ cdef class Doc: other.tensor = copy.deepcopy(self.tensor) other.cats = copy.deepcopy(self.cats) other.user_data = copy.deepcopy(self.user_data) + other.spans = self.spans.copy() other.sentiment = self.sentiment other.has_unknown_spaces = self.has_unknown_spaces other.user_hooks = dict(self.user_hooks) other.user_token_hooks = dict(self.user_token_hooks) other.user_span_hooks = dict(self.user_span_hooks) - other.spans = self.spans.copy() other.length = self.length other.max_length = self.max_length buff_size = other.max_length + (PADDING*2) From c6df0eafd0046179c1c9fb7840074edf04e4721d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 09:50:48 +1100 Subject: [PATCH 10/74] Fix set_annotations in parser.update --- .../_parser_internals/_beam_utils.pyx | 6 ++- .../pipeline/_parser_internals/arc_eager.pyx | 5 +- .../_parser_internals/transition_system.pyx | 10 ++++ spacy/pipeline/transition_parser.pyx | 52 +++++++++++++------ 4 files changed, 55 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx index fa7df2056..ef4165505 100644 --- a/spacy/pipeline/_parser_internals/_beam_utils.pyx +++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx @@ -193,7 +193,11 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de for i, (d_scores, bp_scores) in enumerate(zip(states_d_scores, backprops)): loss += (d_scores**2).mean() bp_scores(d_scores) - return loss + # Return the predicted sequence for each doc. 
+ predicted_histories = [] + for i in range(len(pbeam)): + predicted_histories.append(pbeam[i].histories[0]) + return predicted_histories, loss def collect_states(beams, docs): diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 069b41170..7c3d6d275 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -638,16 +638,17 @@ cdef class ArcEager(TransitionSystem): return gold def init_gold_batch(self, examples): - # TODO: Projectivity? all_states = self.init_batch([eg.predicted for eg in examples]) golds = [] states = [] + docs = [] for state, eg in zip(all_states, examples): if self.has_gold(eg) and not state.is_final(): golds.append(self.init_gold(state, eg)) states.append(state) + docs.append(eg.x) n_steps = sum([len(s.queue) for s in states]) - return states, golds, n_steps + return states, golds, docs def _replace_unseen_labels(self, ArcEagerGold gold): backoff_label = self.strings["dep"] diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 9bb4f7f5f..287513a79 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -120,6 +120,16 @@ cdef class TransitionSystem: raise ValueError(Errors.E024) return history + def follow_history(self, doc, history): + """Get the state that results from following a sequence of actions.""" + cdef int clas + cdef StateClass state + state = self.init_batch([doc])[0] + for clas in history: + action = self.c[clas] + action.do(state.c, action.label) + return state + def apply_transition(self, StateClass state, name): if not self.is_valid(state, name): raise ValueError(Errors.E170.format(name=name)) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 15b07e9b1..b93565178 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -337,21 +337,22 @@ cdef class Parser(TrainablePipe): # Chop sequences into lengths of this many words, to make the # batch uniform length. max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states, golds, _ = self._init_gold_batch( + states, golds, max_moves, state2doc = self._init_gold_batch( examples, max_length=max_moves ) else: - states, golds, _ = self.moves.init_gold_batch(examples) + states, golds, state2doc = self.moves.init_gold_batch(examples) if not states: return losses model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) + histories = [[] for example in examples] all_states = list(states) - states_golds = list(zip(states, golds)) + states_golds = list(zip(states, golds, state2doc)) n_moves = 0 while states_golds: - states, golds = zip(*states_golds) + states, golds, state2doc = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) # Note that the gradient isn't normalized by the batch size @@ -360,8 +361,13 @@ cdef class Parser(TrainablePipe): # be getting smaller gradients for states in long sequences. 
backprop(d_scores) # Follow the predicted action - self.transition_states(states, scores) - states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()] + actions = self.transition_states(states, scores) + for i, action in enumerate(actions): + histories[i].append(action) + states_golds = [ + s for s in zip(states, golds, state2doc) + if not s[0].is_final() + ] if max_moves >= 1 and n_moves >= max_moves: break n_moves += 1 @@ -370,11 +376,11 @@ cdef class Parser(TrainablePipe): if sgd not in (None, False): self.finish_update(sgd) docs = [eg.predicted for eg in examples] - # TODO: Refactor so we don't have to parse twice like this (ugh) - # The issue is that we cut up the gold batch into sub-states, and that - # makes it hard to get the actual predicted transition sequence. - predicted_states = self.predict(docs) - self.set_annotations(docs, predicted_states) + states = [ + self.moves.follow_history(doc, history) + for doc, history in zip(docs, histories) + ] + self.set_annotations(docs, self._get_states(docs, states)) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. @@ -435,13 +441,16 @@ cdef class Parser(TrainablePipe): def update_beam(self, examples, *, beam_width, drop=0., sgd=None, losses=None, beam_density=0.0): - states, golds, _ = self.moves.init_gold_batch(examples) + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + states, golds, docs = self.moves.init_gold_batch(examples) if not states: return losses # Prepare the stepwise model, and get the callback for finishing the batch model, backprop_tok2vec = self.model.begin_update( [eg.predicted for eg in examples]) - loss = _beam_utils.update_beam( + predicted_histories, loss = _beam_utils.update_beam( self.moves, states, golds, @@ -453,6 +462,12 @@ cdef class Parser(TrainablePipe): backprop_tok2vec(golds) if sgd is not None: self.finish_update(sgd) + states = [ + self.moves.follow_history(doc, history) + for doc, history in zip(docs, predicted_histories) + ] + self.set_annotations(docs, states) + return losses def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): cdef StateClass state @@ -595,18 +610,24 @@ cdef class Parser(TrainablePipe): states = [] golds = [] to_cut = [] + # Return a list indicating the position in the batch that each state + # refers to. This lets us put together the full list of predicted + # histories. 
+ state2doc = [] + doc2i = {eg.x: i for i, eg in enumerate(examples)} for state, eg in zip(all_states, examples): if self.moves.has_gold(eg) and not state.is_final(): gold = self.moves.init_gold(state, eg) if len(eg.x) < max_length: states.append(state) golds.append(gold) + state2doc.append(doc2i[eg.x]) else: oracle_actions = self.moves.get_oracle_sequence_from_state( state.copy(), gold) to_cut.append((eg, state, gold, oracle_actions)) if not to_cut: - return states, golds, 0 + return states, golds, 0, state2doc cdef int clas for eg, state, gold, oracle_actions in to_cut: for i in range(0, len(oracle_actions), max_length): @@ -619,6 +640,7 @@ cdef class Parser(TrainablePipe): if self.moves.has_gold(eg, start_state.B(0), state.B(0)): states.append(start_state) golds.append(gold) + state2doc.append(doc2i[eg.x]) if state.is_final(): break - return states, golds, max_length + return states, golds, max_length, state2doc From eb138c89edb306608826dca50619ea8a60de2b14 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 10:52:40 +1100 Subject: [PATCH 11/74] Fix parser set_annotations during update --- spacy/pipeline/transition_parser.pyx | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index b93565178..422246164 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -290,9 +290,6 @@ cdef class Parser(TrainablePipe): cdef void c_transition_batch(self, StateC** states, const float* scores, int nr_class, int batch_size) nogil: - # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - with gil: - assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = calloc(self.moves.n_moves, sizeof(int)) cdef int i, guess cdef Transition action @@ -310,6 +307,7 @@ cdef class Parser(TrainablePipe): def update(self, examples, *, drop=0., sgd=None, losses=None): cdef StateClass state + cdef Transition action if losses is None: losses = {} losses.setdefault(self.name, 0.) @@ -351,6 +349,9 @@ cdef class Parser(TrainablePipe): all_states = list(states) states_golds = list(zip(states, golds, state2doc)) n_moves = 0 + mem = Pool() + is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) + cdef float[::1] scores_row while states_golds: states, golds, state2doc = zip(*states_golds) scores, backprop = model.begin_update(states) @@ -360,10 +361,20 @@ cdef class Parser(TrainablePipe): # can't normalize by the number of states either, as then we'd # be getting smaller gradients for states in long sequences. backprop(d_scores) - # Follow the predicted action - actions = self.transition_states(states, scores) - for i, action in enumerate(actions): - histories[i].append(action) + # Ugh, we need to get the actions for the histories, so we're + # duplicating work that's being done in transition_states. This + # should be refactored. 
+ scores_view = scores + for i, state in enumerate(states): + self.moves.set_valid(is_valid, state.c) + scores_row = scores[i] + guess = arg_max_if_valid(&scores_row[0], is_valid, scores.shape[1]) + if guess == -1: + raise ValueError("Could not find valid transition") + histories[state2doc[i]].append(guess) + # Follow the predicted action + action = self.moves.c[guess] + action.do(state.c, action.label) states_golds = [ s for s in zip(states, golds, state2doc) if not s[0].is_final() From 65f2270d597428386824c6d7be30e64ac33aeaa9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 11:22:43 +1100 Subject: [PATCH 12/74] Revert "Fix parser set_annotations during update" This reverts commit eb138c89edb306608826dca50619ea8a60de2b14. --- spacy/pipeline/transition_parser.pyx | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 422246164..b93565178 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -290,6 +290,9 @@ cdef class Parser(TrainablePipe): cdef void c_transition_batch(self, StateC** states, const float* scores, int nr_class, int batch_size) nogil: + # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc + with gil: + assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = calloc(self.moves.n_moves, sizeof(int)) cdef int i, guess cdef Transition action @@ -307,7 +310,6 @@ cdef class Parser(TrainablePipe): def update(self, examples, *, drop=0., sgd=None, losses=None): cdef StateClass state - cdef Transition action if losses is None: losses = {} losses.setdefault(self.name, 0.) @@ -349,9 +351,6 @@ cdef class Parser(TrainablePipe): all_states = list(states) states_golds = list(zip(states, golds, state2doc)) n_moves = 0 - mem = Pool() - is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) - cdef float[::1] scores_row while states_golds: states, golds, state2doc = zip(*states_golds) scores, backprop = model.begin_update(states) @@ -361,20 +360,10 @@ cdef class Parser(TrainablePipe): # can't normalize by the number of states either, as then we'd # be getting smaller gradients for states in long sequences. backprop(d_scores) - # Ugh, we need to get the actions for the histories, so we're - # duplicating work that's being done in transition_states. This - # should be refactored. - scores_view = scores - for i, state in enumerate(states): - self.moves.set_valid(is_valid, state.c) - scores_row = scores[i] - guess = arg_max_if_valid(&scores_row[0], is_valid, scores.shape[1]) - if guess == -1: - raise ValueError("Could not find valid transition") - histories[state2doc[i]].append(guess) - # Follow the predicted action - action = self.moves.c[guess] - action.do(state.c, action.label) + # Follow the predicted action + actions = self.transition_states(states, scores) + for i, action in enumerate(actions): + histories[i].append(action) states_golds = [ s for s in zip(states, golds, state2doc) if not s[0].is_final() From c631c355d12fb20021a3cabd8cd2cc41142234a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 11:22:57 +1100 Subject: [PATCH 13/74] Revert "Fix set_annotations in parser.update" This reverts commit c6df0eafd0046179c1c9fb7840074edf04e4721d. 
--- .../_parser_internals/_beam_utils.pyx | 6 +-- .../pipeline/_parser_internals/arc_eager.pyx | 5 +- .../_parser_internals/transition_system.pyx | 10 ---- spacy/pipeline/transition_parser.pyx | 52 ++++++------------- 4 files changed, 18 insertions(+), 55 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx index ef4165505..fa7df2056 100644 --- a/spacy/pipeline/_parser_internals/_beam_utils.pyx +++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx @@ -193,11 +193,7 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de for i, (d_scores, bp_scores) in enumerate(zip(states_d_scores, backprops)): loss += (d_scores**2).mean() bp_scores(d_scores) - # Return the predicted sequence for each doc. - predicted_histories = [] - for i in range(len(pbeam)): - predicted_histories.append(pbeam[i].histories[0]) - return predicted_histories, loss + return loss def collect_states(beams, docs): diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 7c3d6d275..069b41170 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -638,17 +638,16 @@ cdef class ArcEager(TransitionSystem): return gold def init_gold_batch(self, examples): + # TODO: Projectivity? all_states = self.init_batch([eg.predicted for eg in examples]) golds = [] states = [] - docs = [] for state, eg in zip(all_states, examples): if self.has_gold(eg) and not state.is_final(): golds.append(self.init_gold(state, eg)) states.append(state) - docs.append(eg.x) n_steps = sum([len(s.queue) for s in states]) - return states, golds, docs + return states, golds, n_steps def _replace_unseen_labels(self, ArcEagerGold gold): backoff_label = self.strings["dep"] diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 287513a79..9bb4f7f5f 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -120,16 +120,6 @@ cdef class TransitionSystem: raise ValueError(Errors.E024) return history - def follow_history(self, doc, history): - """Get the state that results from following a sequence of actions.""" - cdef int clas - cdef StateClass state - state = self.init_batch([doc])[0] - for clas in history: - action = self.c[clas] - action.do(state.c, action.label) - return state - def apply_transition(self, StateClass state, name): if not self.is_valid(state, name): raise ValueError(Errors.E170.format(name=name)) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index b93565178..15b07e9b1 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -337,22 +337,21 @@ cdef class Parser(TrainablePipe): # Chop sequences into lengths of this many words, to make the # batch uniform length. 
max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states, golds, max_moves, state2doc = self._init_gold_batch( + states, golds, _ = self._init_gold_batch( examples, max_length=max_moves ) else: - states, golds, state2doc = self.moves.init_gold_batch(examples) + states, golds, _ = self.moves.init_gold_batch(examples) if not states: return losses model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) - histories = [[] for example in examples] all_states = list(states) - states_golds = list(zip(states, golds, state2doc)) + states_golds = list(zip(states, golds)) n_moves = 0 while states_golds: - states, golds, state2doc = zip(*states_golds) + states, golds = zip(*states_golds) scores, backprop = model.begin_update(states) d_scores = self.get_batch_loss(states, golds, scores, losses) # Note that the gradient isn't normalized by the batch size @@ -361,13 +360,8 @@ cdef class Parser(TrainablePipe): # be getting smaller gradients for states in long sequences. backprop(d_scores) # Follow the predicted action - actions = self.transition_states(states, scores) - for i, action in enumerate(actions): - histories[i].append(action) - states_golds = [ - s for s in zip(states, golds, state2doc) - if not s[0].is_final() - ] + self.transition_states(states, scores) + states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()] if max_moves >= 1 and n_moves >= max_moves: break n_moves += 1 @@ -376,11 +370,11 @@ cdef class Parser(TrainablePipe): if sgd not in (None, False): self.finish_update(sgd) docs = [eg.predicted for eg in examples] - states = [ - self.moves.follow_history(doc, history) - for doc, history in zip(docs, histories) - ] - self.set_annotations(docs, self._get_states(docs, states)) + # TODO: Refactor so we don't have to parse twice like this (ugh) + # The issue is that we cut up the gold batch into sub-states, and that + # makes it hard to get the actual predicted transition sequence. + predicted_states = self.predict(docs) + self.set_annotations(docs, predicted_states) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. @@ -441,16 +435,13 @@ cdef class Parser(TrainablePipe): def update_beam(self, examples, *, beam_width, drop=0., sgd=None, losses=None, beam_density=0.0): - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - states, golds, docs = self.moves.init_gold_batch(examples) + states, golds, _ = self.moves.init_gold_batch(examples) if not states: return losses # Prepare the stepwise model, and get the callback for finishing the batch model, backprop_tok2vec = self.model.begin_update( [eg.predicted for eg in examples]) - predicted_histories, loss = _beam_utils.update_beam( + loss = _beam_utils.update_beam( self.moves, states, golds, @@ -462,12 +453,6 @@ cdef class Parser(TrainablePipe): backprop_tok2vec(golds) if sgd is not None: self.finish_update(sgd) - states = [ - self.moves.follow_history(doc, history) - for doc, history in zip(docs, predicted_histories) - ] - self.set_annotations(docs, states) - return losses def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): cdef StateClass state @@ -610,24 +595,18 @@ cdef class Parser(TrainablePipe): states = [] golds = [] to_cut = [] - # Return a list indicating the position in the batch that each state - # refers to. This lets us put together the full list of predicted - # histories. 
- state2doc = [] - doc2i = {eg.x: i for i, eg in enumerate(examples)} for state, eg in zip(all_states, examples): if self.moves.has_gold(eg) and not state.is_final(): gold = self.moves.init_gold(state, eg) if len(eg.x) < max_length: states.append(state) golds.append(gold) - state2doc.append(doc2i[eg.x]) else: oracle_actions = self.moves.get_oracle_sequence_from_state( state.copy(), gold) to_cut.append((eg, state, gold, oracle_actions)) if not to_cut: - return states, golds, 0, state2doc + return states, golds, 0 cdef int clas for eg, state, gold, oracle_actions in to_cut: for i in range(0, len(oracle_actions), max_length): @@ -640,7 +619,6 @@ cdef class Parser(TrainablePipe): if self.moves.has_gold(eg, start_state.B(0), state.B(0)): states.append(start_state) golds.append(gold) - state2doc.append(doc2i[eg.x]) if state.is_final(): break - return states, golds, max_length, state2doc + return states, golds, max_length From be155ead9b492fbeb438b8f6dcf80de9af6a91bd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 11:56:36 +1100 Subject: [PATCH 14/74] Fix set_annotations during parser update --- .../_parser_internals/transition_system.pyx | 8 ++++ spacy/pipeline/transition_parser.pyx | 47 +++++++++---------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 9bb4f7f5f..61c4544e1 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -61,6 +61,14 @@ cdef class TransitionSystem: offset += len(doc) return states + def follow_history(self, doc, history): + cdef int clas + cdef StateClass state = StateClass(doc) + for clas in history: + action = self.c[clas] + action.do(state.c, action.label) + return state + def get_oracle_sequence(self, Example example, _debug=False): states, golds, _ = self.init_gold_batch([example]) if not states: diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 15b07e9b1..8b974a486 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -317,8 +317,8 @@ cdef class Parser(TrainablePipe): for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) - n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) - if n_examples == 0: + examples = [eg for eg in examples if self.moves.has_gold(eg)] + if len(examples) == 0: return losses set_dropout_rate(self.model, drop) # The probability we use beam update, instead of falling back to @@ -332,6 +332,7 @@ cdef class Parser(TrainablePipe): losses=losses, beam_density=self.cfg["beam_density"] ) + oracle_histories = [self.moves.get_oracle_sequence(eg) for eg in examples] max_moves = self.cfg["update_with_oracle_cut_size"] if max_moves >= 1: # Chop sequences into lengths of this many words, to make the @@ -339,6 +340,7 @@ cdef class Parser(TrainablePipe): max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) states, golds, _ = self._init_gold_batch( examples, + oracle_histories, max_length=max_moves ) else: @@ -370,11 +372,15 @@ cdef class Parser(TrainablePipe): if sgd not in (None, False): self.finish_update(sgd) docs = [eg.predicted for eg in examples] - # TODO: Refactor so we don't have to parse twice like this (ugh) + # If we want to set the annotations based on predictions, it's really + # hard to avoid parsing the data twice :(. 
# The issue is that we cut up the gold batch into sub-states, and that - # makes it hard to get the actual predicted transition sequence. - predicted_states = self.predict(docs) - self.set_annotations(docs, predicted_states) + # means there's no one predicted sequence during the update. + gold_states = [ + self.moves.follow_history(doc, history) + for doc, history in zip(docs, oracle_histories) + ] + self.set_annotations(docs, gold_states) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. @@ -581,7 +587,7 @@ cdef class Parser(TrainablePipe): raise ValueError(Errors.E149) from None return self - def _init_gold_batch(self, examples, max_length): + def _init_gold_batch(self, examples, oracle_histories, max_length): """Make a square batch, of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -594,24 +600,17 @@ cdef class Parser(TrainablePipe): all_states = self.moves.init_batch([eg.predicted for eg in examples]) states = [] golds = [] - to_cut = [] - for state, eg in zip(all_states, examples): - if self.moves.has_gold(eg) and not state.is_final(): - gold = self.moves.init_gold(state, eg) - if len(eg.x) < max_length: - states.append(state) - golds.append(gold) - else: - oracle_actions = self.moves.get_oracle_sequence_from_state( - state.copy(), gold) - to_cut.append((eg, state, gold, oracle_actions)) - if not to_cut: - return states, golds, 0 - cdef int clas - for eg, state, gold, oracle_actions in to_cut: - for i in range(0, len(oracle_actions), max_length): + for state, eg, history in zip(all_states, examples, oracle_histories): + if state.is_final(): + continue + gold = self.moves.init_gold(state, eg) + if len(history) < max_length: + states.append(state) + golds.append(gold) + continue + for i in range(0, len(history), max_length): start_state = state.copy() - for clas in oracle_actions[i:i+max_length]: + for clas in history[i:i+max_length]: action = self.moves.c[clas] action.do(state.c, action.label) if state.is_final(): From a49975343e7ec0fc790b90b6a48ef1f39551cda3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 13:06:27 +1100 Subject: [PATCH 15/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b0b398547..e822db0d0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev12" +__version__ = "3.0.0rc4.dev13" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From cef93d3ae7d18f69229a0e509fac8d80dee9d87b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 13:28:57 +1100 Subject: [PATCH 16/74] Handle final states in get_oracle_sequence --- spacy/pipeline/_parser_internals/transition_system.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 61c4544e1..becaedc60 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -81,6 +81,8 @@ cdef class TransitionSystem: 
return self.get_oracle_sequence_from_state(state, gold) def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None): + if state.is_final(): + return [] cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 From 3a6b93ae3ae1bf69393e4a89f8f6582140f32bc9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 13:29:08 +1100 Subject: [PATCH 17/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index e822db0d0..6aacb9b4d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev13" +__version__ = "3.0.0rc4.dev14" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 456c881ae30aa46905962edeb33202ddab01fb45 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 14:40:05 +1100 Subject: [PATCH 18/74] Try to fix parser training --- .../_parser_internals/transition_system.pyx | 2 ++ spacy/pipeline/transition_parser.pyx | 15 ++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index becaedc60..914b4123c 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -83,6 +83,8 @@ cdef class TransitionSystem: def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None): if state.is_final(): return [] + if not self.has_gold(eg): + return [] cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 8b974a486..fbc93a6d3 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -316,8 +316,9 @@ cdef class Parser(TrainablePipe): validate_examples(examples, "Parser.update") for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) - - examples = [eg for eg in examples if self.moves.has_gold(eg)] + # We need to take care to act on the whole batch, because we might be + # getting vectors via a listener. + n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) if len(examples) == 0: return losses set_dropout_rate(self.model, drop) @@ -347,7 +348,8 @@ cdef class Parser(TrainablePipe): states, golds, _ = self.moves.init_gold_batch(examples) if not states: return losses - model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) + docs = [eg.predicted for eg in examples] + model, backprop_tok2vec = self.model.begin_update(docs) all_states = list(states) states_golds = list(zip(states, golds)) @@ -371,7 +373,6 @@ cdef class Parser(TrainablePipe): backprop_tok2vec(golds) if sgd not in (None, False): self.finish_update(sgd) - docs = [eg.predicted for eg in examples] # If we want to set the annotations based on predictions, it's really # hard to avoid parsing the data twice :(. 
# The issue is that we cut up the gold batch into sub-states, and that @@ -601,7 +602,7 @@ cdef class Parser(TrainablePipe): states = [] golds = [] for state, eg, history in zip(all_states, examples, oracle_histories): - if state.is_final(): + if not history: continue gold = self.moves.init_gold(state, eg) if len(history) < max_length: @@ -609,6 +610,8 @@ cdef class Parser(TrainablePipe): golds.append(gold) continue for i in range(0, len(history), max_length): + if state.is_final(): + break start_state = state.copy() for clas in history[i:i+max_length]: action = self.moves.c[clas] @@ -618,6 +621,4 @@ cdef class Parser(TrainablePipe): if self.moves.has_gold(eg, start_state.B(0), state.B(0)): states.append(start_state) golds.append(gold) - if state.is_final(): - break return states, golds, max_length From 772248f84a90e21799543e90fcc489cd38aa832b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 14:40:31 +1100 Subject: [PATCH 19/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 6aacb9b4d..8a65062f7 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev14" +__version__ = "3.0.0rc4.dev15" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 19747d98d15fbaab438b7e7c2c2a927c0f865635 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 14:51:46 +1100 Subject: [PATCH 20/74] Fix --- spacy/pipeline/_parser_internals/transition_system.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 914b4123c..becaedc60 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -83,8 +83,6 @@ cdef class TransitionSystem: def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None): if state.is_final(): return [] - if not self.has_gold(eg): - return [] cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 From 46b61972483ee5b7bbba8a50288adf510f87614e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 14:52:14 +1100 Subject: [PATCH 21/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8a65062f7..afe08478f 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev15" +__version__ = "3.0.0rc4.dev16" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 38ad6c7b6af2a840352e713dc298de3be152ee95 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 15:26:43 +1100 Subject: [PATCH 22/74] Fix parser oracle --- spacy/pipeline/_parser_internals/transition_system.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index becaedc60..5bc92f161 100644 --- 
a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -70,6 +70,8 @@ cdef class TransitionSystem: return state def get_oracle_sequence(self, Example example, _debug=False): + if not self.has_gold(example): + return [] states, golds, _ = self.init_gold_batch([example]) if not states: return [] From 585ee4c81c9c5f90eba7c275215fe69a5822ea0a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 15:27:05 +1100 Subject: [PATCH 23/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index afe08478f..4b8766c95 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev16" +__version__ = "3.0.0rc4.dev17" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From b2044d510edd6d899a84917288f408a4955434b5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 16:21:54 +1100 Subject: [PATCH 24/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4b8766c95..2831c7064 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev17" +__version__ = "3.0.0rc4.dev18" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From bd04ea0b0260012fd4524f2a07b941c05122c10f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 16:48:45 +1100 Subject: [PATCH 25/74] Fix transition has_gold --- spacy/pipeline/_parser_internals/arc_eager.pyx | 2 ++ spacy/pipeline/_parser_internals/ner.pyx | 2 ++ 2 files changed, 4 insertions(+) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 069b41170..03cb8a4d7 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -757,6 +757,8 @@ cdef class ArcEager(TransitionSystem): return list(arcs) def has_gold(self, Example eg, start=0, end=None): + if end is not None and end < 0: + end = None for word in eg.y[start:end]: if word.dep != 0: return True diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index d0da6ff70..a591a0ea6 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -266,6 +266,8 @@ cdef class BiluoPushDown(TransitionSystem): return BiluoGold(self, state, example) def has_gold(self, Example eg, start=0, end=None): + if end is not None and end < 0: + end = None for word in eg.y[start:end]: if word.ent_iob != 0: return True From c3c462e562b36bb2e861282673d8cfd9cb7ebefb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 16:48:58 +1100 Subject: [PATCH 26/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 2831c7064..27216e76c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev18" +__version__ = 
"3.0.0rc4.dev19" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 5b2440a1fd40c980f60da539d3ccd91c388526d8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 18:59:52 +1100 Subject: [PATCH 27/74] Try to use real histories, not oracle --- spacy/pipeline/_parser_internals/_state.pxd | 2 + .../pipeline/_parser_internals/arc_eager.pyx | 1 + .../pipeline/_parser_internals/stateclass.pyx | 4 ++ .../_parser_internals/transition_system.pyx | 3 ++ spacy/pipeline/transition_parser.pyx | 39 ++++++++++--------- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index a6bf926f9..7f644a151 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -32,6 +32,7 @@ cdef cppclass StateC: vector[ArcC] _left_arcs vector[ArcC] _right_arcs vector[libcpp.bool] _unshiftable + vector[int] history set[int] _sent_starts TokenC _empty_token int length @@ -382,3 +383,4 @@ cdef cppclass StateC: this._b_i = src._b_i this.offset = src.offset this._empty_token = src._empty_token + this.history = src.history diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 03cb8a4d7..b477891f8 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -844,6 +844,7 @@ cdef class ArcEager(TransitionSystem): state.print_state() ))) action.do(state.c, action.label) + state.c.history.push_back(i) break else: failed = False diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index 4eaddd997..208cf061e 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -20,6 +20,10 @@ cdef class StateClass: if self._borrowed != 1: del self.c + @property + def history(self): + return list(self.c.history) + @property def stack(self): return [self.S(i) for i in range(self.c.stack_depth())] diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 5bc92f161..181cffd8d 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -67,6 +67,7 @@ cdef class TransitionSystem: for clas in history: action = self.c[clas] action.do(state.c, action.label) + state.c.history.push_back(clas) return state def get_oracle_sequence(self, Example example, _debug=False): @@ -110,6 +111,7 @@ cdef class TransitionSystem: "S0 head?", str(state.has_head(state.S(0))), ))) action.do(state.c, action.label) + state.c.history.push_back(i) break else: if _debug: @@ -137,6 +139,7 @@ cdef class TransitionSystem: raise ValueError(Errors.E170.format(name=name)) action = self.lookup_transition(name) action.do(state.c, action.label) + state.c.history.push_back(action.clas) cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index fbc93a6d3..3c5e5e9f9 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -203,15 +203,21 @@ cdef class Parser(TrainablePipe): ) def greedy_parse(self, docs, 
drop=0.): - cdef vector[StateC*] states - cdef StateClass state set_dropout_rate(self.model, drop) - batch = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() model = self.model.predict(docs) + batch = self.moves.init_batch(docs) + states = self._predict_states(model, batch) + model.clear_memory() + del model + return states + + def _predict_states(self, model, batch): + cdef vector[StateC*] states + cdef StateClass state weights = get_c_weights(model) for state in batch: if not state.is_final(): @@ -220,8 +226,6 @@ cdef class Parser(TrainablePipe): with nogil: self._parseC(&states[0], weights, sizes) - model.clear_memory() - del model return batch def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): @@ -306,6 +310,7 @@ cdef class Parser(TrainablePipe): else: action = self.moves.c[guess] action.do(states[i], action.label) + states[i].history.push_back(guess) free(is_valid) def update(self, examples, *, drop=0., sgd=None, losses=None): @@ -319,7 +324,7 @@ cdef class Parser(TrainablePipe): # We need to take care to act on the whole batch, because we might be # getting vectors via a listener. n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) - if len(examples) == 0: + if n_examples == 0: return losses set_dropout_rate(self.model, drop) # The probability we use beam update, instead of falling back to @@ -333,7 +338,11 @@ cdef class Parser(TrainablePipe): losses=losses, beam_density=self.cfg["beam_density"] ) - oracle_histories = [self.moves.get_oracle_sequence(eg) for eg in examples] + model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) + final_states = self.moves.init_batch([eg.x for eg in examples]) + self._predict_states(model, final_states) + histories = [list(state.history) for state in final_states] + #oracle_histories = [self.moves.get_oracle_sequence(eg) for eg in examples] max_moves = self.cfg["update_with_oracle_cut_size"] if max_moves >= 1: # Chop sequences into lengths of this many words, to make the @@ -341,15 +350,13 @@ cdef class Parser(TrainablePipe): max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) states, golds, _ = self._init_gold_batch( examples, - oracle_histories, + histories, max_length=max_moves ) else: states, golds, _ = self.moves.init_gold_batch(examples) if not states: return losses - docs = [eg.predicted for eg in examples] - model, backprop_tok2vec = self.model.begin_update(docs) all_states = list(states) states_golds = list(zip(states, golds)) @@ -373,15 +380,7 @@ cdef class Parser(TrainablePipe): backprop_tok2vec(golds) if sgd not in (None, False): self.finish_update(sgd) - # If we want to set the annotations based on predictions, it's really - # hard to avoid parsing the data twice :(. - # The issue is that we cut up the gold batch into sub-states, and that - # means there's no one predicted sequence during the update. - gold_states = [ - self.moves.follow_history(doc, history) - for doc, history in zip(docs, oracle_histories) - ] - self.set_annotations(docs, gold_states) + self.set_annotations([eg.x for eg in examples], final_states) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. 
@@ -599,6 +598,7 @@ cdef class Parser(TrainablePipe): StateClass state Transition action all_states = self.moves.init_batch([eg.predicted for eg in examples]) + assert len(all_states) == len(examples) == len(oracle_histories) states = [] golds = [] for state, eg, history in zip(all_states, examples, oracle_histories): @@ -616,6 +616,7 @@ cdef class Parser(TrainablePipe): for clas in history[i:i+max_length]: action = self.moves.c[clas] action.do(state.c, action.label) + state.c.history.push_back(clas) if state.is_final(): break if self.moves.has_gold(eg, start_state.B(0), state.B(0)): From af0b3bc4d8ac11c3cbff98f0235c7df647e30dc5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 19:02:27 +1100 Subject: [PATCH 28/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 27216e76c..67a6271e6 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev19" +__version__ = "3.0.0rc4.dev20" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 70bcc1f48e7f83e1cbfb853a38de9ed025d3eccd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 19:17:41 +1100 Subject: [PATCH 29/74] Upd parser --- spacy/pipeline/transition_parser.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 3c5e5e9f9..36588f5e8 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -604,6 +604,8 @@ cdef class Parser(TrainablePipe): for state, eg, history in zip(all_states, examples, oracle_histories): if not history: continue + if not self.moves.has_gold(eg): + continue gold = self.moves.init_gold(state, eg) if len(history) < max_length: states.append(state) From cda3b08dd1a39ac618b39d3a6cf30721d95d2871 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 19:17:57 +1100 Subject: [PATCH 30/74] Inc version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 67a6271e6..5eaf3c224 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0rc4.dev20" +__version__ = "3.0.0rc4.dev21" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From b456929bfde8f3f10441c030813bc2ff5fb1c1e0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 23:20:30 +1100 Subject: [PATCH 31/74] WIP on rewrite parser --- spacy/pipeline/_parser_internals/ner.pyx | 12 +- spacy/pipeline/transition_parser.pyx | 176 +++++++++-------------- 2 files changed, 70 insertions(+), 118 deletions(-) diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index a591a0ea6..e4e95695c 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -350,9 +350,9 @@ cdef class Begin: elif st.B_(1).ent_iob == 3: # If the next word is B, we can't B now return False - elif st.B_(1).sent_start == 1: - # Don't allow entities to extend across sentence 
boundaries - return False + #elif st.B_(1).sent_start == 1: + # # Don't allow entities to extend across sentence boundaries + # return False # Don't allow entities to start on whitespace elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): return False @@ -418,9 +418,9 @@ cdef class In: # Otherwise, force acceptance, even if we're across a sentence # boundary or the token is whitespace. return True - elif st.B(1) != -1 and st.B_(1).sent_start == 1: - # Don't allow entities to extend across sentence boundaries - return False + #elif st.B(1) != -1 and st.B_(1).sent_start == 1: + # # Don't allow entities to extend across sentence boundaries + # return False else: return True diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 36588f5e8..206b82ef7 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -10,7 +10,7 @@ import random from typing import Optional import srsly -from thinc.api import set_dropout_rate, CupyOps +from thinc.api import set_dropout_rate, CupyOps, get_array_module from thinc.extra.search cimport Beam import numpy.random import numpy @@ -338,58 +338,79 @@ cdef class Parser(TrainablePipe): losses=losses, beam_density=self.cfg["beam_density"] ) - model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) - final_states = self.moves.init_batch([eg.x for eg in examples]) - self._predict_states(model, final_states) - histories = [list(state.history) for state in final_states] - #oracle_histories = [self.moves.get_oracle_sequence(eg) for eg in examples] - max_moves = self.cfg["update_with_oracle_cut_size"] - if max_moves >= 1: - # Chop sequences into lengths of this many words, to make the - # batch uniform length. - max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states, golds, _ = self._init_gold_batch( - examples, - histories, - max_length=max_moves - ) - else: - states, golds, _ = self.moves.init_gold_batch(examples) - if not states: - return losses - - all_states = list(states) - states_golds = list(zip(states, golds)) - n_moves = 0 - while states_golds: - states, golds = zip(*states_golds) - scores, backprop = model.begin_update(states) - d_scores = self.get_batch_loss(states, golds, scores, losses) - # Note that the gradient isn't normalized by the batch size - # here, because our "samples" are really the states...But we - # can't normalize by the number of states either, as then we'd - # be getting smaller gradients for states in long sequences. - backprop(d_scores) - # Follow the predicted action - self.transition_states(states, scores) - states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()] - if max_moves >= 1 and n_moves >= max_moves: - break - n_moves += 1 - - backprop_tok2vec(golds) + docs = [eg.x for eg in examples] + model, backprop_tok2vec = self.model.begin_update(docs) + states = self.moves.init_batch(docs) + self._predict_states(states) + # I've separated the prediction from getting the batch because + # I like the idea of trying to store the histories or maybe compute + # them in another process or something. Just walking the states + # and transitioning isn't expensive anyway. 
+ ids, costs = self._get_ids_and_costs_from_histories( + examples, + [list(state.history) for state in states] + ) + scores, backprop_states = model.begin_update(ids) + d_scores = self.get_loss(scores, costs) + d_tokvecs = backprop_states(d_scores) + backprop_tok2vec(d_tokvecs) if sgd not in (None, False): self.finish_update(sgd) - self.set_annotations([eg.x for eg in examples], final_states) + self.set_annotations(docs, states) + losses[self.name] += (d_scores**2).sum() # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. - del backprop + del backprop_states del backprop_tok2vec model.clear_memory() del model return losses + def _get_ids_and_costs_from_histories(self, examples, histories): + cdef StateClass state + cdef int clas + cdef int nF = self.model.state2vec.nF + cdef int nO = self.moves.n_moves + cdef int nS = sum([len(history) for history in histories]) + # ids and costs have one row per state in the whole batch. + cdef np.ndarray ids = numpy.zeros((nS, nF), dtype="i") + cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f") + cdef Pool mem = Pool() + is_valid = mem.alloc(nO, sizeof(int)) + c_ids = ids.data + c_costs = costs.data + states = self.moves.init_states([eg.x for eg in examples]) + cdef int i = 0 + for eg, state, history in zip(examples, states, histories): + gold = self.moves.init_gold(state, eg) + for clas in history: + # Set a row into the C data of the arrays (which we return) + state.c.set_context_tokens(&c_ids[i*nF], nF) + self.moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold) + action = self.moves.c[clas] + action.do(state.c, action.label) + state.c.history.push_back(clas) + i += 1 + # If the model is on GPU, copy the costs to device. 
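# (Illustrative only, not part of this patch.) The costs built above feed the
# loss defined just below: actions whose cost equals the row minimum form the
# "gold" set, and the gradient is softmax(scores) minus a softmax restricted to
# those gold actions, pushing probability mass from non-gold onto gold
# transitions. A small self-contained numpy sketch of that gradient:
import numpy as np

eg_scores = np.array([[2.0, 1.0, 0.5], [0.1, 0.3, 2.2]], dtype="f")   # one row per state
eg_costs = np.array([[0.0, 1.0, 0.0], [2.0, 0.0, 1.0]], dtype="f")    # 0 = gold-consistent action
is_gold = eg_costs <= eg_costs.min(axis=1, keepdims=True)
exp_scores = np.exp(eg_scores - eg_scores.max(axis=1, keepdims=True))
d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)         # softmax over all actions
gold_only = np.where(is_gold, eg_scores, -np.inf)
exp_gold = np.exp(gold_only - gold_only.max(axis=1, keepdims=True))
d_scores -= np.where(is_gold, exp_gold / exp_gold.sum(axis=1, keepdims=True), 0.0)
print(d_scores.sum(axis=1))   # ~0 per row: mass moves from non-gold onto gold actions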
+ costs = self.model.ops.asarray(costs) + return ids, costs + + def get_loss(self, scores, costs): + xp = get_array_module(scores) + best_costs = costs.min(axis=1, keepdims=True) + is_gold = costs <= costs.min(axis=1, keepdims=True) + gscores = scores[is_gold] + max_ = scores.max(axis=1) + gmax = gscores.max(axis=1, keepdims=True) + exp_scores = xp.exp(scores - max_) + exp_gscores = xp.exp(gscores - gmax) + Z = exp_scores.sum(axis=1, keepdims=True) + gZ = exp_gscores.sum(axis=1, keepdims=True) + d_scores = exp_scores / Z + d_scores[is_gold] -= exp_gscores / gZ + return d_scores + def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" if losses is None: @@ -460,36 +481,6 @@ cdef class Parser(TrainablePipe): if sgd is not None: self.finish_update(sgd) - def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): - cdef StateClass state - cdef Pool mem = Pool() - cdef int i - - # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) - - is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) - costs = mem.alloc(self.moves.n_moves, sizeof(float)) - cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), - dtype='f', order='C') - c_d_scores = d_scores.data - unseen_classes = self.model.attrs["unseen_classes"] - for i, (state, gold) in enumerate(zip(states, golds)): - memset(is_valid, 0, self.moves.n_moves * sizeof(int)) - memset(costs, 0, self.moves.n_moves * sizeof(float)) - self.moves.set_costs(is_valid, costs, state.c, gold) - for j in range(self.moves.n_moves): - if costs[j] <= 0.0 and j in unseen_classes: - unseen_classes.remove(j) - cpu_log_loss(c_d_scores, - costs, is_valid, &scores[i, 0], d_scores.shape[1]) - c_d_scores += d_scores.shape[1] - # Note that we don't normalize this. See comment in update() for why. - if losses is not None: - losses.setdefault(self.name, 0.) - losses[self.name] += (d_scores**2).sum() - return d_scores - def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) @@ -586,42 +577,3 @@ cdef class Parser(TrainablePipe): except AttributeError: raise ValueError(Errors.E149) from None return self - - def _init_gold_batch(self, examples, oracle_histories, max_length): - """Make a square batch, of length equal to the shortest transition - sequence or a cap. A long - doc will get multiple states. Let's say we have a doc of length 2*N, - where N is the shortest doc. 
We'll make two states, one representing - long_doc[:N], and another representing long_doc[N:].""" - cdef: - StateClass start_state - StateClass state - Transition action - all_states = self.moves.init_batch([eg.predicted for eg in examples]) - assert len(all_states) == len(examples) == len(oracle_histories) - states = [] - golds = [] - for state, eg, history in zip(all_states, examples, oracle_histories): - if not history: - continue - if not self.moves.has_gold(eg): - continue - gold = self.moves.init_gold(state, eg) - if len(history) < max_length: - states.append(state) - golds.append(gold) - continue - for i in range(0, len(history), max_length): - if state.is_final(): - break - start_state = state.copy() - for clas in history[i:i+max_length]: - action = self.moves.c[clas] - action.do(state.c, action.label) - state.c.history.push_back(clas) - if state.is_final(): - break - if self.moves.has_gold(eg, start_state.B(0), state.B(0)): - states.append(start_state) - golds.append(gold) - return states, golds, max_length From 267ffb560560507eb7b6bba8225e719592097ca6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Jan 2021 23:22:10 +1100 Subject: [PATCH 32/74] WIP refactor parser --- spacy/ml/parser_model.pyx | 204 +++++++++++++------------------------- 1 file changed, 71 insertions(+), 133 deletions(-) diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index da937ca4f..cef9b6fc9 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -18,8 +18,9 @@ from ..pipeline._parser_internals.stateclass cimport StateClass cdef WeightsC get_c_weights(model) except *: cdef WeightsC output cdef precompute_hiddens state2vec = model.state2vec + cdef np.ndarray bias = state2vec.bias output.feat_weights = state2vec.get_feat_weights() - output.feat_bias = state2vec.bias.data + output.feat_bias = bias.data cdef np.ndarray vec2scores_W cdef np.ndarray vec2scores_b if model.vec2scores is None: @@ -220,27 +221,23 @@ class ParserStepModel(Model): activation = None else: activation = "relu" - self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], - activation=activation, train=train) + self.state2vec = precompute_hiddens( + len(docs), + self.tokvecs, + layers[1], + activation=activation, + train=train + ) if has_upper: self.vec2scores = layers[-1] else: self.vec2scores = None - self.cuda_stream = util.get_cuda_stream(non_blocking=True) - self.backprops = [] self._class_mask = numpy.zeros((self.nO,), dtype='f') self._class_mask.fill(1) if unseen_classes is not None: for class_ in unseen_classes: self._class_mask[class_] = 0. 
- def clear_memory(self): - del self.tokvecs - del self.bp_tokvecs - del self.state2vec - del self.backprops - del self._class_mask - @property def nO(self): if self.attrs["has_upper"]: @@ -248,6 +245,13 @@ class ParserStepModel(Model): else: return self.state2vec.get_dim("nO") + def clear_memory(self): + del self.tokvecs + del self.bp_tokvecs + del self.state2vec + del self.backprops + del self._class_mask + def class_is_unseen(self, class_): return self._class_mask[class_] @@ -269,54 +273,22 @@ class ParserStepModel(Model): c_ids += ids.shape[1] return ids - def backprop_step(self, token_ids, d_vector, get_d_tokvecs): - if isinstance(self.state2vec.ops, CupyOps) \ - and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to GPU, asynchronously - self.backprops.append(( - util.get_async(self.cuda_stream, token_ids), - util.get_async(self.cuda_stream, d_vector), - get_d_tokvecs - )) - else: - self.backprops.append((token_ids, d_vector, get_d_tokvecs)) - - def finish_steps(self, golds): - # Add a padding vector to the d_tokvecs gradient, so that missing - # values don't affect the real gradient. - d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) - # Tells CUDA to block, so our async copies complete. - if self.cuda_stream is not None: - self.cuda_stream.synchronize() - for ids, d_vector, bp_vector in self.backprops: - d_state_features = bp_vector((d_vector, ids)) - ids = ids.flatten() - d_state_features = d_state_features.reshape( - (ids.size, d_state_features.shape[2])) - self.ops.scatter_add(d_tokvecs, ids, - d_state_features) - # Padded -- see update() - self.bp_tokvecs(d_tokvecs[:-1]) - return d_tokvecs - -NUMPY_OPS = NumpyOps() - -def step_forward(model: ParserStepModel, states, is_train): - token_ids = model.get_token_ids(states) +def step_forward(model: ParserStepModel, token_ids, is_train): vector, get_d_tokvecs = model.state2vec(token_ids, is_train) mask = None if model.attrs["has_upper"]: + vec2scores = ensure_same_device(model.ops, model.vec2scores) dropout_rate = model.attrs["dropout_rate"] if is_train and dropout_rate > 0: - mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1) + mask = model.ops.get_dropout_mask(vector.shape, dropout_rate) vector *= mask - scores, get_d_vector = model.vec2scores(vector, is_train) + scores, get_d_vector = vec2scores(vector, is_train) else: - scores = NumpyOps().asarray(vector) + scores = vector get_d_vector = lambda d_scores: d_scores # If the class is unseen, make sure its score is minimum - scores[:, model._class_mask == 0] = numpy.nanmin(scores) + scores[:, model._class_mask == 0] = model.ops.xp.nanmin(scores) def backprop_parser_step(d_scores): # Zero vectors for unseen classes @@ -324,11 +296,18 @@ def step_forward(model: ParserStepModel, states, is_train): d_vector = get_d_vector(d_scores) if mask is not None: d_vector *= mask - model.backprop_step(token_ids, d_vector, get_d_tokvecs) - return None + return get_d_tokvecs(d_vector) + return scores, backprop_parser_step +def ensure_same_device(ops, model): + """Ensure a model is on the same device as a given ops""" + if not isinstance(model.ops, ops.__class__): + model._to_ops(ops) + return model + + cdef class precompute_hiddens: """Allow a model to be "primed" by pre-computing input features in bulk. @@ -347,31 +326,23 @@ cdef class precompute_hiddens: and do the hard-to-program parsing on the CPU. 
""" cdef readonly int nF, nO, nP - cdef bint _is_synchronized cdef public object ops - cdef public object numpy_ops - cdef np.ndarray _features - cdef np.ndarray _cached - cdef np.ndarray bias - cdef object _cuda_stream - cdef object _bp_hiddens - cdef object activation + cdef readonly object bias + cdef readonly object activation + cdef readonly object _features + cdef readonly object _cached + cdef readonly object _bp_hiddens - def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - activation="maxout", train=False): - gpu_cached, bp_features = lower_model(tokvecs, train) - cdef np.ndarray cached - if not isinstance(gpu_cached, numpy.ndarray): - # Note the passing of cuda_stream here: it lets - # cupy make the copy asynchronously. - # We then have to block before first use. - cached = gpu_cached.get(stream=cuda_stream) - else: - cached = gpu_cached - if not isinstance(lower_model.get_param("b"), numpy.ndarray): - self.bias = lower_model.get_param("b").get(stream=cuda_stream) - else: - self.bias = lower_model.get_param("b") + def __init__( + self, + batch_size, + tokvecs, + lower_model, + activation="maxout", + train=False + ): + cached, bp_features = lower_model(tokvecs, train) + self.bias = lower_model.get_param("b") self.nF = cached.shape[1] if lower_model.has_dim("nP"): self.nP = lower_model.get_dim("nP") @@ -379,19 +350,18 @@ cdef class precompute_hiddens: self.nP = 1 self.nO = cached.shape[2] self.ops = lower_model.ops - self.numpy_ops = NumpyOps() assert activation in (None, "relu", "maxout") self.activation = activation - self._is_synchronized = False - self._cuda_stream = cuda_stream self._cached = cached self._bp_hiddens = bp_features cdef const float* get_feat_weights(self) except NULL: - if not self._is_synchronized and self._cuda_stream is not None: - self._cuda_stream.synchronize() - self._is_synchronized = True - return self._cached.data + cdef np.ndarray cached + if isinstance(self._cached, numpy.ndarray): + cached = self._cached + else: + cached = self._cached.get() + return cached.data def has_dim(self, name): if name == "nF": @@ -433,57 +403,25 @@ cdef class precompute_hiddens: return self.begin_update(X)[0] def begin_update(self, token_ids): - cdef np.ndarray state_vector = numpy.zeros( - (token_ids.shape[0], self.nO, self.nP), dtype='f') - # This is tricky, but (assuming GPU available); - # - Input to forward on CPU - # - Output from forward on CPU - # - Input to backward on GPU! 
- # - Output from backward on GPU + nO = self.nO + nP = self.nP + hidden = self.model.ops.alloc2f( + token_ids.shape[0], + nO * nP + ) bp_hiddens = self._bp_hiddens + feat_weights = self.cached + self.ops.scatter_add( + hidden, + feat_weights, + token_ids + ) + hidden += self.bias + statevec, mask = self.ops.maxout(hidden.reshape((-1, nO, nP))) - feat_weights = self.get_feat_weights() - cdef int[:, ::1] ids = token_ids - sum_state_features(state_vector.data, - feat_weights, &ids[0,0], - token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias - state_vector, bp_nonlinearity = self._nonlinearity(state_vector) - - def backward(d_state_vector_ids): - d_state_vector, token_ids = d_state_vector_ids - d_state_vector = bp_nonlinearity(d_state_vector) - d_tokens = bp_hiddens((d_state_vector, token_ids)) - return d_tokens - return state_vector, backward - - def _nonlinearity(self, state_vector): - if self.activation == "maxout": - return self._maxout_nonlinearity(state_vector) - else: - return self._relu_nonlinearity(state_vector) - - def _maxout_nonlinearity(self, state_vector): - state_vector, mask = self.numpy_ops.maxout(state_vector) - # We're outputting to CPU, but we need this variable on GPU for the - # backward pass. - mask = self.ops.asarray(mask) - - def backprop_maxout(d_best): - return self.ops.backprop_maxout(d_best, mask, self.nP) + def backward(d_statevec): + return bp_hiddens( + self.ops.backprop_maxout(d_statevec, mask, nP) + ) - return state_vector, backprop_maxout - - def _relu_nonlinearity(self, state_vector): - state_vector = state_vector.reshape((state_vector.shape[0], -1)) - mask = state_vector >= 0. - state_vector *= mask - # We're outputting to CPU, but we need this variable on GPU for the - # backward pass. - mask = self.ops.asarray(mask) - - def backprop_relu(d_best): - d_best *= mask - return d_best.reshape((d_best.shape + (1,))) - - return state_vector, backprop_relu + return statevec, backward From de8c88babb650631bc50a813aceeda32f09c58fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 03:13:31 +0200 Subject: [PATCH 33/74] New progress on parser model refactor --- spacy/ml/parser_model.pyx | 256 +++++++-------------- spacy/ml/tb_framework.py | 328 ++++++++++++++++++++++++--- spacy/pipeline/transition_parser.pyx | 236 +++---------------- 3 files changed, 410 insertions(+), 410 deletions(-) diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index cef9b6fc9..72140401b 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -208,50 +208,41 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no -class ParserStepModel(Model): - def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True, - dropout=0.1): - Model.__init__(self, name="parser_step_model", forward=step_forward) - self.attrs["has_upper"] = has_upper - self.attrs["dropout_rate"] = dropout - self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) - if layers[1].get_dim("nP") >= 2: - activation = "maxout" - elif has_upper: - activation = None - else: - activation = "relu" - self.state2vec = precompute_hiddens( - len(docs), - self.tokvecs, - layers[1], - activation=activation, - train=train - ) - if has_upper: - self.vec2scores = layers[-1] - else: - self.vec2scores = None - self._class_mask = numpy.zeros((self.nO,), dtype='f') - self._class_mask.fill(1) - if unseen_classes is not None: - for class_ in unseen_classes: - self._class_mask[class_] = 0. 
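# (Illustrative only, not part of this patch.) What the class mask above buys:
# at prediction time the scores of never-seen classes are clamped to the row
# minimum so they cannot win the argmax, and during the update their gradient
# is zeroed so they are never trained. A minimal numpy sketch:
import numpy as np

mask = np.ones((4,), dtype="f")
mask[[2, 3]] = 0.0                              # classes 2 and 3 were never seen
toy_scores = np.array([[0.5, 1.5, 9.0, 2.0]], dtype="f")
toy_scores[:, mask == 0] = toy_scores.min()     # unseen classes can no longer win
assert toy_scores.argmax(axis=1)[0] == 1
toy_grad = np.array([[0.1, -0.2, 0.3, 0.4]], dtype="f")
toy_grad *= mask                                # and they receive no gradient
print(toy_grad)                                 # [[ 0.1 -0.2  0.   0. ]]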
+def ParserStepModel( + tokvecs: Floats2d, + bp_tokvecs: Callable, + upper: Model[Floats2d, Floats2d], + dropout: float=0.1 + unseen_classes: Optional[List[int]]=None +) -> Model[Ints2d, Floats2d]: + # TODO: Keep working on replacing all of this with just 'chain' + state2vec = precompute_hiddens( + tokvecs, + bp_tokvecs + ) + class_mask = numpy.zeros((self.nO,), dtype='f') + class_mask.fill(1) + if unseen_classes is not None: + for class_ in unseen_classes: + class_mask[class_] = 0. - @property - def nO(self): - if self.attrs["has_upper"]: - return self.vec2scores.get_dim("nO") - else: - return self.state2vec.get_dim("nO") + return _ParserStepModel( + "ParserStep", + step_forward, + init=None, + dims={"nO": upper.get_dim("nO")}, + layers=[state2vec, upper], + attrs={ + "tokvecs": tokvecs, + "bp_tokvecs": bp_tokvecs, + "dropout_rate": dropout, + "class_mask": class_mask + } + ) - def clear_memory(self): - del self.tokvecs - del self.bp_tokvecs - del self.state2vec - del self.backprops - del self._class_mask +class _ParserStepModel(Model): + # TODO: Remove need for all this stuff, so we can normalize this def class_is_unseen(self, class_): return self._class_mask[class_] @@ -274,21 +265,22 @@ class ParserStepModel(Model): return ids -def step_forward(model: ParserStepModel, token_ids, is_train): - vector, get_d_tokvecs = model.state2vec(token_ids, is_train) +def step_forward(model: _ParserStepModel, token_ids, is_train): + # TODO: Eventually we hopefully can get rid of all of this? + # If we make the 'class_mask' thing its own layer, we can just + # have chain() here, right? + state2vec, upper = model.layers + vector, get_d_tokvecs = state2vec(token_ids, is_train) mask = None - if model.attrs["has_upper"]: - vec2scores = ensure_same_device(model.ops, model.vec2scores) - dropout_rate = model.attrs["dropout_rate"] - if is_train and dropout_rate > 0: - mask = model.ops.get_dropout_mask(vector.shape, dropout_rate) - vector *= mask - scores, get_d_vector = vec2scores(vector, is_train) - else: - scores = vector - get_d_vector = lambda d_scores: d_scores + vec2scores = ensure_same_device(model.ops, vec2scores) + dropout_rate = model.attrs["dropout_rate"] + if is_train and dropout_rate > 0: + mask = model.ops.get_dropout_mask(vector.shape, dropout_rate) + vector *= mask + scores, get_d_vector = vec2scores(vector, is_train) # If the class is unseen, make sure its score is minimum - scores[:, model._class_mask == 0] = model.ops.xp.nanmin(scores) + class_mask = model.attrs["class_mask"] + scores[:, class_mask == 0] = model.ops.xp.nanmin(scores) def backprop_parser_step(d_scores): # Zero vectors for unseen classes @@ -301,127 +293,45 @@ def step_forward(model: ParserStepModel, token_ids, is_train): return scores, backprop_parser_step -def ensure_same_device(ops, model): - """Ensure a model is on the same device as a given ops""" - if not isinstance(model.ops, ops.__class__): - model._to_ops(ops) - return model +def precompute_hiddens(lower_model, feat_weights: Floats3d, bp_hiddens: Callable) -> Model: + return Model( + "precompute_hiddens", + init=None, + forward=_precompute_forward, + dims={ + "nO": feat_weights.shape[2], + "nP": lower_model.get_dim("nP") if lower_model.has_dim("nP") else 1, + "nF": cached.shape[1] + }, + ops=lower_model.ops + ) -cdef class precompute_hiddens: - """Allow a model to be "primed" by pre-computing input features in bulk. 
+def _precomputed_forward( + model: Model[Ints2d, Floats2d], + token_ids: Ints2d, + is_train: bool +) -> Tuple[Floats2d, Callable]: + nO = model.get_dim("nO") + nP = model.get_dim("nP") + bp_hiddens = model.attrs["bp_hiddens"] + feat_weights = model.attrs["feat_weights"] + bias = model.attrs["bias"] + hidden = model.ops.alloc2f( + token_ids.shape[0], + nO * nP + ) + # TODO: This is probably wrong, right? + model.ops.scatter_add( + hidden, + feat_weights, + token_ids + ) + statevec, mask = model.ops.maxout(hidden.reshape((-1, nO, nP))) - This is used for the parser, where we want to take a batch of documents, - and compute vectors for each (token, position) pair. These vectors can then - be reused, especially for beam-search. - - Let's say we're using 12 features for each state, e.g. word at start of - buffer, three words on stack, their children, etc. In the normal arc-eager - system, a document of length N is processed in 2*N states. This means we'll - create 2*N*12 feature vectors --- but if we pre-compute, we only need - N*12 vector computations. The saving for beam-search is much better: - if we have a beam of k, we'll normally make 2*N*12*K computations -- - so we can save the factor k. This also gives a nice CPU/GPU division: - we can do all our hard maths up front, packed into large multiplications, - and do the hard-to-program parsing on the CPU. - """ - cdef readonly int nF, nO, nP - cdef public object ops - cdef readonly object bias - cdef readonly object activation - cdef readonly object _features - cdef readonly object _cached - cdef readonly object _bp_hiddens - - def __init__( - self, - batch_size, - tokvecs, - lower_model, - activation="maxout", - train=False - ): - cached, bp_features = lower_model(tokvecs, train) - self.bias = lower_model.get_param("b") - self.nF = cached.shape[1] - if lower_model.has_dim("nP"): - self.nP = lower_model.get_dim("nP") - else: - self.nP = 1 - self.nO = cached.shape[2] - self.ops = lower_model.ops - assert activation in (None, "relu", "maxout") - self.activation = activation - self._cached = cached - self._bp_hiddens = bp_features - - cdef const float* get_feat_weights(self) except NULL: - cdef np.ndarray cached - if isinstance(self._cached, numpy.ndarray): - cached = self._cached - else: - cached = self._cached.get() - return cached.data - - def has_dim(self, name): - if name == "nF": - return self.nF if self.nF is not None else True - elif name == "nP": - return self.nP if self.nP is not None else True - elif name == "nO": - return self.nO if self.nO is not None else True - else: - return False - - def get_dim(self, name): - if name == "nF": - return self.nF - elif name == "nP": - return self.nP - elif name == "nO": - return self.nO - else: - raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") - - def set_dim(self, name, value): - if name == "nF": - self.nF = value - elif name == "nP": - self.nP = value - elif name == "nO": - self.nO = value - else: - raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP") - - def __call__(self, X, bint is_train): - if is_train: - return self.begin_update(X) - else: - return self.predict(X), lambda X: X - - def predict(self, X): - return self.begin_update(X)[0] - - def begin_update(self, token_ids): - nO = self.nO - nP = self.nP - hidden = self.model.ops.alloc2f( - token_ids.shape[0], - nO * nP - ) - bp_hiddens = self._bp_hiddens - feat_weights = self.cached - self.ops.scatter_add( - hidden, - feat_weights, - token_ids + def backward(d_statevec): + return bp_hiddens( + 
model.ops.backprop_maxout(d_statevec, mask, nP) ) - hidden += self.bias - statevec, mask = self.ops.maxout(hidden.reshape((-1, nO, nP))) - - def backward(d_statevec): - return bp_hiddens( - self.ops.backprop_maxout(d_statevec, mask, nP) - ) - return statevec, backward + return statevec, backward diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 4ab5830cd..1e14d239e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,48 +1,314 @@ -from thinc.api import Model, noop -from .parser_model import ParserStepModel +from typing import List, Tuple, Any, Optional +from thinc.api import Ops, Model, normal_init +from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d +from ..tokens.doc import Doc + + +TransitionSystem = Any # TODO +State = Any # TODO def TransitionModel( - tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set() -): - """Set up a stepwise transition-based model""" - if upper is None: - has_upper = False - upper = noop() - else: - has_upper = True - # don't define nO for this object, because we can't dynamically change it + *, + tok2vec: Model[List[Doc], List[Floats2d]], + state_tokens: int, + hidden_width: int, + maxout_pieces: int, + nO: Optional[int] = None, + unseen_classes=set(), +) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]: + """Set up a transition-based parsing model, using a maxout hidden + layer and a linear output layer. + """ return Model( name="parser_model", forward=forward, - dims={"nI": tok2vec.get_dim("nI") if tok2vec.has_dim("nI") else None}, - layers=[tok2vec, lower, upper], - refs={"tok2vec": tok2vec, "lower": lower, "upper": upper}, init=init, + layers=[tok2vec], + refs={"tok2vec": tok2vec}, + params={ + "lower_W": None, # Floats2d W for the hidden layer + "lower_b": None, # Floats1d bias for the hidden layer + "lower_pad": None, # Floats1d bias for the hidden layer + "upper_W": None, # Floats2d W for the output layer + "upper_b": None, # Floats1d bias for the output layer + }, + dims={ + "nO": None, # Output size + "nP": maxout_pieces, + "nH": hidden_width, + "nI": tok2vec.maybe_get_dim("nO"), + "nF": state_tokens, + }, attrs={ - "has_upper": has_upper, "unseen_classes": set(unseen_classes), "resize_output": resize_output, + "make_step_model": make_step_model, }, ) -def forward(model, X, is_train): - step_model = ParserStepModel( - X, - model.layers, - unseen_classes=model.attrs["unseen_classes"], - train=is_train, - has_upper=model.attrs["has_upper"], +def make_step_model(model: Model) -> Model[List[State], Floats2d]: + ... + + +def resize_output(model: Model) -> Model: + ... 
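# (Hypothetical sketch, not part of this patch.) resize_output is still a stub
# here; in spaCy's existing parser models, resizing the output typically means
# growing the upper layer while keeping the rows already learned for known
# classes, roughly along these lines, using the "upper_W"/"upper_b" names from
# this file (assumed shapes: upper_W is (nO, nH), upper_b is (nO,)):
def _resize_output_sketch(model, new_nO):
    old_W = model.get_param("upper_W")
    old_b = model.get_param("upper_b")
    old_nO, nH = old_W.shape
    if new_nO <= old_nO:
        return model
    new_W = model.ops.alloc2f(new_nO, nH)
    new_b = model.ops.alloc1f(new_nO)
    new_W[:old_nO] = old_W                      # keep weights for existing classes
    new_b[:old_nO] = old_b
    model.set_param("upper_W", new_W)
    model.set_param("upper_b", new_b)
    model.set_dim("nO", new_nO, force=True)     # assumes force=True is available for re-setting nO
    return model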
+ + +def init( + model, + X: Optional[Tuple[List[Doc], TransitionSystem]] = None, + Y: Optional[Tuple[List[State], List[Floats2d]]] = None, +): + if X is not None: + docs, states = X + model.get_ref("tok2vec").initialize(X=docs) + inferred_nO = _infer_nO(Y) + if inferred_nO is not None: + current_nO = model.maybe_get_dim("nO") + if current_nO is None: + model.set_dim("nO", inferred_nO) + elif current_nO != inferred_nO: + model.attrs["resize_output"](model, inferred_nO) + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nH = model.get_dim("nH") + nI = model.get_dim("nI") + nF = model.get_dim("nF") + ops = model.ops + + Wl = ops.alloc4f(nF, nH, nP, nI) + bl = ops.alloc2f(nH, nP) + padl = ops.alloc4f(1, nF, nH, nP) + Wu = ops.alloc2f(nO, nH) + bu = ops.alloc1f(nO) + Wl = normal_init(ops, Wl.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) + padl = normal_init(ops, padl.shape, mean=1.0) + # TODO: Experiment with whether better to initialize Wu + model.set_param("lower_W", Wl) + model.set_param("lower_b", bl) + model.set_param("lower_pad", padl) + model.set_param("upper_W", Wu) + model.set_param("upper_b", bu) + + _lsuv_init(model) + + +def forward(model, docs_moves, is_train): + tok2vec = model.get_ref("tok2vec") + state2scores = model.get_ref("state2scores") + # Get a reference to the parameters. We need to work with + # stable references through the forward/backward pass, to make + # sure we don't have a stale reference if there's concurrent shenanigans. + params = {name: model.get_param(name) for name in model.param_names} + ops = model.ops + docs, moves = docs_moves + states = moves.init_batch(docs) + tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) + memory = [] + all_scores = [] + while states: + states, scores, memory = _step_parser( + ops, params, moves, states, feats, memory, is_train + ) + all_scores.append(scores) + + def backprop_parser(d_states_d_scores): + _, d_scores = d_states_d_scores + d_feats, ids = _backprop_parser_steps(ops, params, memory, d_scores) + d_tokvecs = backprop_feats((d_feats, ids)) + return backprop_tok2vec(d_tokvecs), None + + return (states, all_scores), backprop_parser + + +def _step_parser(ops, params, moves, states, feats, memory, is_train): + ids = moves.get_state_ids(states) + statevecs, which, scores = _score_ids(ops, params, ids, feats, is_train) + next_states = moves.transition_states(states, scores) + if is_train: + memory.append((ids, statevecs, which)) + return next_states, scores, memory + + +def _score_ids(ops, params, ids, feats, is_train): + lower_pad = params["lower_pad"] + lower_b = params["lower_b"] + upper_W = params["upper_W"] + upper_b = params["upper_b"] + # During each step of the parser, we do: + # * Index into the features, to get the pre-activated vector + # for each (token, feature) and sum the feature vectors + preacts = _sum_state_features(feats, lower_pad, ids) + # * Add the bias + preacts += lower_b + # * Apply the activation (maxout) + statevecs, which = ops.maxout(preacts) + # * Multiply the state-vector by the scores weights + scores = ops.gemm(statevecs, upper_W, trans2=True) + # * Add the bias + scores += upper_b + # * Apply the is-class-unseen masking + # TODO + return statevecs, which, scores + + +def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d) -> Floats2d: + # Here's what we're trying to implement here: + # + # for i in range(ids.shape[0]): + # for j in range(ids.shape[1]): + # output[i] += feats[ids[i, j], j] + # + # 
Reshape the feats into 2d, to make indexing easier. Instead of getting an + # array of indices where the cell at (4, 2) needs to refer to the row at + # feats[4, 2], we'll translate the index so that it directly addresses + # feats[18]. This lets us make the indices array 1d, leading to fewer + # numpy shennanigans. + feats2d = ops.reshape2f(feats, feats.shape[0] * feats.shape[1], feats.shape[2]) + # Now translate the ids. If we're looking for the row that used to be at + # (4, 1) and we have 4 features, we'll find it at (4*4)+1=17. + oob_ids = ids < 0 # Retain the -1 values + ids = ids * feats.shape[1] + ops.xp.arange(feats.shape[1]) + ids[oob_ids] = -1 + unsummed2d = feats2d[ops.reshape1i(ids, ids.size)] + unsummed3d = ops.reshape3f( + unsummed2d, feats.shape[0], feats.shape[1], feats.shape[2] ) - - return step_model, step_model.finish_steps + summed = unsummed3d.sum(axis=1) # type: ignore + return summed -def init(model, X=None, Y=None): - model.get_ref("tok2vec").initialize(X=X) - lower = model.get_ref("lower") - lower.initialize() - if model.attrs["has_upper"]: - statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) - model.get_ref("upper").initialize(X=statevecs) +def _process_memory(ops, memory): + """Concatenate the memory buffers from each state into contiguous + buffers for the whole batch. + """ + return [ops.xp.concatenate(*item) for item in zip(*memory)] + + +def _backprop_parser_steps(model, upper_W, memory, d_scores): + # During each step of the parser, we do: + # * Index into the features, to get the pre-activated vector + # for each (token, feature) + # * Sum the feature vectors + # * Add the bias + # * Apply the activation (maxout) + # * Multiply the state-vector by the scores weights + # * Add the bias + # * Apply the is-class-unseen masking + # + # So we have to backprop through all those steps. + ids, statevecs, whiches = _process_memory(model.ops, memory) + # TODO: Unseen class masking + # Calculate the gradients for the parameters of the upper layer. + model.inc_grad("upper_b", d_scores.sum(axis=0)) + model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) + # Now calculate d_statevecs, by backproping through the upper linear layer. + d_statevecs = model.ops.gemm(d_scores, upper_W) + # Backprop through the maxout activation + d_preacts = model.ops.backprop_maxount(d_statevecs, whiches, model.get_dim("nP")) + # We don't need to backprop the summation, because we pass back the IDs instead + return d_preacts, ids + + +def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): + + W: Floats4d = model.get_param("lower_W") + b: Floats2d = model.get_param("lower_b") + pad: Floats4d = model.get_param("lower_pad") + nF = model.get_dim("nF") + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + Yf_ = model.ops.gemm(X, model.ops.reshape2f(W, nF * nO * nP, nI), trans2=True) + Yf = model.ops.reshape4f(Yf_, Yf_.shape[0], nF, nO, nP) + Yf = model.ops.xp.vstack((Yf, pad)) + + def backward(dY_ids: Tuple[Floats3d, Ints2d]): + # This backprop is particularly tricky, because we get back a different + # thing from what we put out. We put out an array of shape: + # (nB, nF, nO, nP), and get back: + # (nB, nO, nP) and ids (nB, nF) + # The ids tell us the values of nF, so we would have: + # + # dYf = zeros((nB, nF, nO, nP)) + # for b in range(nB): + # for f in range(nF): + # dYf[b, ids[b, f]] += dY[b] + # + # However, we avoid building that array for efficiency -- and just pass + # in the indices. 
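# (Self-contained check, not part of this patch.) The index translation used in
# _sum_state_features above, verified against the naive double loop with plain
# numpy; the padding row for missing (-1) ids is handled separately in the
# patch and omitted here:
import numpy as np

n_tokens, n_feats, width = 5, 3, 4
toy_feats = np.random.rand(n_tokens, n_feats, width).astype("f")
toy_ids = np.array([[0, 2, 4], [4, 4, 1]], dtype="i")       # (state, feature slot) -> token index

expected = np.zeros((toy_ids.shape[0], width), dtype="f")
for b in range(toy_ids.shape[0]):
    for f in range(toy_ids.shape[1]):
        expected[b] += toy_feats[toy_ids[b, f], f]          # naive per-slot gather and sum

feats2d = toy_feats.reshape(n_tokens * n_feats, width)      # flatten (token, slot) into rows
flat_ids = toy_ids * n_feats + np.arange(n_feats)           # (token, slot) -> flat row index
summed = feats2d[flat_ids.ravel()].reshape(toy_ids.shape[0], n_feats, width).sum(axis=1)
assert np.allclose(summed, expected)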
+ dY, ids = dY_ids + assert dY.ndim == 3 + assert dY.shape[1] == nO, dY.shape + assert dY.shape[2] == nP, dY.shape + # nB = dY.shape[0] + model.inc_grad( + "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) + ) + Xf = model.ops.reshape2f(X[ids], ids.shape[0], nF * nI) + + model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore + dY = model.ops.reshape2f(dY, dY.shape[0], nO * nP) + + Wopfi = W.transpose((1, 2, 0, 3)) + Wopfi = Wopfi.reshape((nO * nP, nF * nI)) + dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) + + dWopfi = model.ops.gemm(dY, Xf, trans1=True) + dWopfi = dWopfi.reshape((nO, nP, nF, nI)) + # (o, p, f, i) --> (f, o, p, i) + dWopfi = dWopfi.transpose((2, 0, 1, 3)) + model.inc_grad("W", dWopfi) + return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI) + + return Yf, backward + + +def _backprop_precomputable_affine_padding(model, dY, ids): + nB = dY.shape[0] + nF = model.get_dim("nF") + nP = model.get_dim("nP") + nO = model.get_dim("nO") + # Backprop the "padding", used as a filler for missing values. + # Values that are missing are set to -1, and each state vector could + # have multiple missing values. The padding has different values for + # different missing features. The gradient of the padding vector is: + # + # for b in range(nB): + # for f in range(nF): + # if ids[b, f] < 0: + # d_pad[f] += dY[b] + # + # Which can be rewritten as: + # + # (ids < 0).T @ dY + mask = model.ops.asarray(ids < 0, dtype="f") + d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) + return d_pad.reshape((1, nF, nO, nP)) + + +def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]: + if Y is None: + return None + _, scores = Y + if len(scores) == 0: + return None + assert scores[0].shape[0] >= 1 + assert len(scores[0].shape) == 2 + return scores[0].shape[1] + + +def _lsuv_init(model): + """This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + """ + # TODO + return None diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 206b82ef7..76999b736 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -208,70 +208,11 @@ cdef class Parser(TrainablePipe): # if labels are missing. We therefore have to check whether we need to # expand our model output. self._resize() - model = self.model.predict(docs) - batch = self.moves.init_batch(docs) - states = self._predict_states(model, batch) - model.clear_memory() - del model + states, scores = self.model.predict((docs, self.moves)) return states - def _predict_states(self, model, batch): - cdef vector[StateC*] states - cdef StateClass state - weights = get_c_weights(model) - for state in batch: - if not state.is_final(): - states.push_back(state.c) - sizes = get_c_sizes(model, states.size()) - with nogil: - self._parseC(&states[0], - weights, sizes) - return batch - def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): - cdef Beam beam - cdef Doc doc - batch = _beam_utils.BeamBatch( - self.moves, - self.moves.init_batch(docs), - None, - beam_width, - density=beam_density - ) - # This is pretty dirty, but the NER can resize itself in init_batch, - # if labels are missing. 
We therefore have to check whether we need to - # expand our model output. - self._resize() - model = self.model.predict(docs) - while not batch.is_done: - states = batch.get_unfinished_states() - if not states: - break - scores = model.predict(states) - batch.advance(scores) - model.clear_memory() - del model - return list(batch) - - cdef void _parseC(self, StateC** states, - WeightsC weights, SizesC sizes) nogil: - cdef int i, j - cdef vector[StateC*] unfinished - cdef ActivationsC activations = alloc_activations(sizes) - while sizes.states >= 1: - predict_states(&activations, - states, &weights, sizes) - # Validate actions, argmax, take action. - self.c_transition_batch(states, - activations.scores, sizes.classes, sizes.states) - for i in range(sizes.states): - if not states[i].is_final(): - unfinished.push_back(states[i]) - for i in range(unfinished.size()): - states[i] = unfinished[i] - sizes.states = unfinished.size() - unfinished.clear() - free_activations(&activations) + raise NotImplementedError def set_annotations(self, docs, states_or_beams): cdef StateClass state @@ -283,36 +224,6 @@ cdef class Parser(TrainablePipe): for hook in self.postprocesses: hook(doc) - def transition_states(self, states, float[:, ::1] scores): - cdef StateClass state - cdef float* c_scores = &scores[0, 0] - cdef vector[StateC*] c_states - for state in states: - c_states.push_back(state.c) - self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0]) - return [state for state in states if not state.c.is_final()] - - cdef void c_transition_batch(self, StateC** states, const float* scores, - int nr_class, int batch_size) nogil: - # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - with gil: - assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) - is_valid = calloc(self.moves.n_moves, sizeof(int)) - cdef int i, guess - cdef Transition action - for i in range(batch_size): - self.moves.set_valid(is_valid, states[i]) - guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class) - if guess == -1: - # This shouldn't happen, but it's hard to raise an error here, - # and we don't want to infinite loop. So, force to end state. - states[i].force_final() - else: - action = self.moves.c[guess] - action.do(states[i], action.label) - states[i].history.push_back(guess) - free(is_valid) - def update(self, examples, *, drop=0., sgd=None, losses=None): cdef StateClass state if losses is None: @@ -327,58 +238,48 @@ cdef class Parser(TrainablePipe): if n_examples == 0: return losses set_dropout_rate(self.model, drop) - # The probability we use beam update, instead of falling back to - # a greedy update - beam_update_prob = self.cfg["beam_update_prob"] - if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam( - examples, - beam_width=self.cfg["beam_width"], - sgd=sgd, - losses=losses, - beam_density=self.cfg["beam_density"] - ) docs = [eg.x for eg in examples] - model, backprop_tok2vec = self.model.begin_update(docs) - states = self.moves.init_batch(docs) - self._predict_states(states) - # I've separated the prediction from getting the batch because - # I like the idea of trying to store the histories or maybe compute - # them in another process or something. Just walking the states - # and transitioning isn't expensive anyway. 
- ids, costs = self._get_ids_and_costs_from_histories( - examples, - [list(state.history) for state in states] - ) - scores, backprop_states = model.begin_update(ids) - d_scores = self.get_loss(scores, costs) - d_tokvecs = backprop_states(d_scores) - backprop_tok2vec(d_tokvecs) + (states, scores), backprop_scores = self.model.begin_update((docs, self.moves)) + d_scores = self.get_loss((states, scores), examples) + backprop_scores(d_scores) if sgd not in (None, False): self.finish_update(sgd) - self.set_annotations(docs, states) losses[self.name] += (d_scores**2).sum() # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. - del backprop_states - del backprop_tok2vec - model.clear_memory() - del model + del backprop_scores return losses - def _get_ids_and_costs_from_histories(self, examples, histories): + def get_loss(self, states_scores, examples): + states, scores = states_scores + costs = self._get_costs_from_histories( + examples, + [list(state.history) for state in states] + ) + xp = get_array_module(scores) + best_costs = costs.min(axis=1, keepdims=True) + is_gold = costs <= costs.min(axis=1, keepdims=True) + gscores = scores[is_gold] + max_ = scores.max(axis=1) + gmax = gscores.max(axis=1, keepdims=True) + exp_scores = xp.exp(scores - max_) + exp_gscores = xp.exp(gscores - gmax) + Z = exp_scores.sum(axis=1, keepdims=True) + gZ = exp_gscores.sum(axis=1, keepdims=True) + d_scores = exp_scores / Z + d_scores[is_gold] -= exp_gscores / gZ + return d_scores + + def _get_costs_from_histories(self, examples, histories): cdef StateClass state cdef int clas cdef int nF = self.model.state2vec.nF cdef int nO = self.moves.n_moves cdef int nS = sum([len(history) for history in histories]) - # ids and costs have one row per state in the whole batch. - cdef np.ndarray ids = numpy.zeros((nS, nF), dtype="i") cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f") cdef Pool mem = Pool() is_valid = mem.alloc(nO, sizeof(int)) - c_ids = ids.data c_costs = costs.data states = self.moves.init_states([eg.x for eg in examples]) cdef int i = 0 @@ -394,92 +295,15 @@ cdef class Parser(TrainablePipe): i += 1 # If the model is on GPU, copy the costs to device. costs = self.model.ops.asarray(costs) - return ids, costs - - def get_loss(self, scores, costs): - xp = get_array_module(scores) - best_costs = costs.min(axis=1, keepdims=True) - is_gold = costs <= costs.min(axis=1, keepdims=True) - gscores = scores[is_gold] - max_ = scores.max(axis=1) - gmax = gscores.max(axis=1, keepdims=True) - exp_scores = xp.exp(scores - max_) - exp_gscores = xp.exp(gscores - gmax) - Z = exp_scores.sum(axis=1, keepdims=True) - gZ = exp_gscores.sum(axis=1, keepdims=True) - d_scores = exp_scores / Z - d_scores[is_gold] -= exp_gscores / gZ - return d_scores + return costs def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" - if losses is None: - losses = {} - for multitask in self._multitasks: - if hasattr(multitask, 'rehearse'): - multitask.rehearse(examples, losses=losses, sgd=sgd) - if self._rehearsal_model is None: - return None - losses.setdefault(self.name, 0.) - validate_examples(examples, "Parser.rehearse") - docs = [eg.predicted for eg in examples] - states = self.moves.init_batch(docs) - # This is pretty dirty, but the NER can resize itself in init_batch, - # if labels are missing. 
We therefore have to check whether we need to - # expand our model output. - self._resize() - # Prepare the stepwise model, and get the callback for finishing the batch - set_dropout_rate(self._rehearsal_model, 0.0) - set_dropout_rate(self.model, 0.0) - tutor, _ = self._rehearsal_model.begin_update(docs) - model, backprop_tok2vec = self.model.begin_update(docs) - n_scores = 0. - loss = 0. - while states: - targets, _ = tutor.begin_update(states) - guesses, backprop = model.begin_update(states) - d_scores = (guesses - targets) / targets.shape[0] - # If all weights for an output are 0 in the original model, don't - # supervise that output. This allows us to add classes. - loss += (d_scores**2).sum() - backprop(d_scores) - # Follow the predicted action - self.transition_states(states, guesses) - states = [state for state in states if not state.is_final()] - n_scores += d_scores.size - # Do the backprop - backprop_tok2vec(docs) - if sgd is not None: - self.finish_update(sgd) - losses[self.name] += loss / n_scores - del backprop - del backprop_tok2vec - model.clear_memory() - tutor.clear_memory() - del model - del tutor - return losses + raise NotImplementedError def update_beam(self, examples, *, beam_width, drop=0., sgd=None, losses=None, beam_density=0.0): - states, golds, _ = self.moves.init_gold_batch(examples) - if not states: - return losses - # Prepare the stepwise model, and get the callback for finishing the batch - model, backprop_tok2vec = self.model.begin_update( - [eg.predicted for eg in examples]) - loss = _beam_utils.update_beam( - self.moves, - states, - golds, - model, - beam_width, - beam_density=beam_density, - ) - losses[self.name] += loss - backprop_tok2vec(golds) - if sgd is not None: - self.finish_update(sgd) + raise NotImplementedError def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) From 34aab9899f4438f18fbc361ff3f14e02ce460aac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:22:46 +0200 Subject: [PATCH 34/74] Prepare to remove parser_model.pyx --- spacy/ml/parser_model.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx index 72140401b..6aa8e8e9c 100644 --- a/spacy/ml/parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -212,7 +212,7 @@ def ParserStepModel( tokvecs: Floats2d, bp_tokvecs: Callable, upper: Model[Floats2d, Floats2d], - dropout: float=0.1 + dropout: float=0.1, unseen_classes: Optional[List[int]]=None ) -> Model[Ints2d, Floats2d]: # TODO: Keep working on replacing all of this with just 'chain' From 7b9c2824696335f456e9216a657d47160ad6f294 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:28:13 +0200 Subject: [PATCH 35/74] Convert parser from cdef class --- setup.py | 5 +++-- spacy/pipeline/dep_parser.pyx | 6 +++--- spacy/pipeline/ner.pyx | 6 +++--- spacy/pipeline/transition_parser.pxd | 19 ------------------- spacy/pipeline/transition_parser.pyx | 21 +++++++++------------ 5 files changed, 18 insertions(+), 39 deletions(-) delete mode 100644 spacy/pipeline/transition_parser.pxd diff --git a/setup.py b/setup.py index fb659bcb0..a4663d070 100755 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.kb", - "spacy.ml.parser_model", "spacy.morphology", "spacy.pipeline.dep_parser", "spacy.pipeline.morphologizer", @@ -203,7 +202,9 @@ def setup_package(): ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" - ext = Extension(name, [mod_path], 
language="c++", extra_compile_args=["-std=c++11"]) + ext = Extension( + name, [mod_path], language="c++", extra_compile_args=["-std=c++11"] + ) ext_modules.append(ext) print("Cythonizing sources") ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES) diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 18c9fd25a..7bdb2849d 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -3,8 +3,8 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config -from .transition_parser cimport Parser -from ._parser_internals.arc_eager cimport ArcEager +from .transition_parser import Parser +from ._parser_internals.arc_eager import ArcEager from .functions import merge_subtokens from ..language import Language @@ -199,7 +199,7 @@ def make_beam_parser( ) -cdef class DependencyParser(Parser): +class DependencyParser(Parser): """Pipeline component for dependency parsing. DOCS: https://nightly.spacy.io/api/dependencyparser diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 0dfb055d3..cd2f9e1cf 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -3,8 +3,8 @@ from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config -from .transition_parser cimport Parser -from ._parser_internals.ner cimport BiluoPushDown +from .transition_parser import Parser +from ._parser_internals.ner import BiluoPushDown from ..language import Language from ..scorer import get_ner_prf, PRFScore @@ -160,7 +160,7 @@ def make_beam_ner( ) -cdef class EntityRecognizer(Parser): +class EntityRecognizer(Parser): """Pipeline component for named entity recognition. DOCS: https://nightly.spacy.io/api/entityrecognizer diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd deleted file mode 100644 index bd5bad334..000000000 --- a/spacy/pipeline/transition_parser.pxd +++ /dev/null @@ -1,19 +0,0 @@ -from cymem.cymem cimport Pool - -from ..vocab cimport Vocab -from .trainable_pipe cimport TrainablePipe -from ._parser_internals.transition_system cimport Transition, TransitionSystem -from ._parser_internals._state cimport StateC -from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC - - -cdef class Parser(TrainablePipe): - cdef public object _rehearsal_model - cdef readonly TransitionSystem moves - cdef public object _multitasks - - cdef void _parseC(self, StateC** states, - WeightsC weights, SizesC sizes) nogil - - cdef void c_transition_batch(self, StateC** states, const float* scores, - int nr_class, int batch_size) nogil diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 76999b736..c86a32a12 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -17,21 +17,19 @@ import numpy import warnings from ._parser_internals.stateclass cimport StateClass -from ..ml.parser_model cimport alloc_activations, free_activations -from ..ml.parser_model cimport predict_states, arg_max_if_valid -from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss -from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..tokens.doc cimport Doc from .trainable_pipe import TrainablePipe from ._parser_internals cimport _beam_utils from ._parser_internals import _beam_utils +from ..vocab cimport Vocab +from ._parser_internals.transition_system cimport TransitionSystem from ..training import validate_examples, 
validate_get_examples from ..errors import Errors, Warnings from .. import util -cdef class Parser(TrainablePipe): +class Parser(TrainablePipe): """ Base class of the DependencyParser and EntityRecognizer. """ @@ -272,24 +270,23 @@ cdef class Parser(TrainablePipe): return d_scores def _get_costs_from_histories(self, examples, histories): + cdef TransitionSystem moves = self.moves cdef StateClass state cdef int clas cdef int nF = self.model.state2vec.nF - cdef int nO = self.moves.n_moves + cdef int nO = moves.n_moves cdef int nS = sum([len(history) for history in histories]) cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f") cdef Pool mem = Pool() is_valid = mem.alloc(nO, sizeof(int)) c_costs = costs.data - states = self.moves.init_states([eg.x for eg in examples]) + states = moves.init_states([eg.x for eg in examples]) cdef int i = 0 for eg, state, history in zip(examples, states, histories): - gold = self.moves.init_gold(state, eg) + gold = moves.init_gold(state, eg) for clas in history: - # Set a row into the C data of the arrays (which we return) - state.c.set_context_tokens(&c_ids[i*nF], nF) - self.moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold) - action = self.moves.c[clas] + moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold) + action = moves.c[clas] action.do(state.c, action.label) state.c.history.push_back(clas) i += 1 From 9b459f9ef2c13fc9283e810719e168b1f1ef1c23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:28:31 +0200 Subject: [PATCH 36/74] Delete spacy.ml.parser_model --- spacy/ml/parser_model.pxd | 48 ------ spacy/ml/parser_model.pyx | 337 -------------------------------------- 2 files changed, 385 deletions(-) delete mode 100644 spacy/ml/parser_model.pxd delete mode 100644 spacy/ml/parser_model.pyx diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd deleted file mode 100644 index 6582b3468..000000000 --- a/spacy/ml/parser_model.pxd +++ /dev/null @@ -1,48 +0,0 @@ -from libc.string cimport memset, memcpy -from ..typedefs cimport weight_t, hash_t -from ..pipeline._parser_internals._state cimport StateC - - -cdef struct SizesC: - int states - int classes - int hiddens - int pieces - int feats - int embed_width - - -cdef struct WeightsC: - const float* feat_weights - const float* feat_bias - const float* hidden_bias - const float* hidden_weights - const float* seen_classes - - -cdef struct ActivationsC: - int* token_ids - float* unmaxed - float* scores - float* hiddens - int* is_valid - int _curr_size - int _max_size - - -cdef WeightsC get_c_weights(model) except * - -cdef SizesC get_c_sizes(model, int batch_size) except * - -cdef ActivationsC alloc_activations(SizesC n) nogil - -cdef void free_activations(const ActivationsC* A) nogil - -cdef void predict_states(ActivationsC* A, StateC** states, - const WeightsC* W, SizesC n) nogil - -cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil - -cdef void cpu_log_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, int O) nogil - diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx deleted file mode 100644 index 6aa8e8e9c..000000000 --- a/spacy/ml/parser_model.pyx +++ /dev/null @@ -1,337 +0,0 @@ -# cython: infer_types=True, cdivision=True, boundscheck=False -cimport numpy as np -from libc.math cimport exp -from libc.string cimport memset, memcpy -from libc.stdlib cimport calloc, free, realloc -from thinc.backends.linalg cimport Vec, VecVec -cimport blis.cy - -import numpy -import numpy.random -from 
thinc.api import Model, CupyOps, NumpyOps - -from .. import util -from ..typedefs cimport weight_t, class_t, hash_t -from ..pipeline._parser_internals.stateclass cimport StateClass - - -cdef WeightsC get_c_weights(model) except *: - cdef WeightsC output - cdef precompute_hiddens state2vec = model.state2vec - cdef np.ndarray bias = state2vec.bias - output.feat_weights = state2vec.get_feat_weights() - output.feat_bias = bias.data - cdef np.ndarray vec2scores_W - cdef np.ndarray vec2scores_b - if model.vec2scores is None: - output.hidden_weights = NULL - output.hidden_bias = NULL - else: - vec2scores_W = model.vec2scores.get_param("W") - vec2scores_b = model.vec2scores.get_param("b") - output.hidden_weights = vec2scores_W.data - output.hidden_bias = vec2scores_b.data - cdef np.ndarray class_mask = model._class_mask - output.seen_classes = class_mask.data - return output - - -cdef SizesC get_c_sizes(model, int batch_size) except *: - cdef SizesC output - output.states = batch_size - if model.vec2scores is None: - output.classes = model.state2vec.get_dim("nO") - else: - output.classes = model.vec2scores.get_dim("nO") - output.hiddens = model.state2vec.get_dim("nO") - output.pieces = model.state2vec.get_dim("nP") - output.feats = model.state2vec.get_dim("nF") - output.embed_width = model.tokvecs.shape[1] - return output - - -cdef ActivationsC alloc_activations(SizesC n) nogil: - cdef ActivationsC A - memset(&A, 0, sizeof(A)) - resize_activations(&A, n) - return A - - -cdef void free_activations(const ActivationsC* A) nogil: - free(A.token_ids) - free(A.scores) - free(A.unmaxed) - free(A.hiddens) - free(A.is_valid) - - -cdef void resize_activations(ActivationsC* A, SizesC n) nogil: - if n.states <= A._max_size: - A._curr_size = n.states - return - if A._max_size == 0: - A.token_ids = calloc(n.states * n.feats, sizeof(A.token_ids[0])) - A.scores = calloc(n.states * n.classes, sizeof(A.scores[0])) - A.unmaxed = calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0])) - A.hiddens = calloc(n.states * n.hiddens, sizeof(A.hiddens[0])) - A.is_valid = calloc(n.states * n.classes, sizeof(A.is_valid[0])) - A._max_size = n.states - else: - A.token_ids = realloc(A.token_ids, - n.states * n.feats * sizeof(A.token_ids[0])) - A.scores = realloc(A.scores, - n.states * n.classes * sizeof(A.scores[0])) - A.unmaxed = realloc(A.unmaxed, - n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])) - A.hiddens = realloc(A.hiddens, - n.states * n.hiddens * sizeof(A.hiddens[0])) - A.is_valid = realloc(A.is_valid, - n.states * n.classes * sizeof(A.is_valid[0])) - A._max_size = n.states - A._curr_size = n.states - - -cdef void predict_states(ActivationsC* A, StateC** states, - const WeightsC* W, SizesC n) nogil: - cdef double one = 1.0 - resize_activations(A, n) - for i in range(n.states): - states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) - memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) - memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) - sum_state_features(A.unmaxed, - W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces) - for i in range(n.states): - VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces], - W.feat_bias, 1., n.hiddens * n.pieces) - for j in range(n.hiddens): - index = i * n.hiddens * n.pieces + j * n.pieces - which = Vec.arg_max(&A.unmaxed[index], n.pieces) - A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] - memset(A.scores, 0, n.states * n.classes * sizeof(float)) - if W.hidden_weights == NULL: - memcpy(A.scores, A.hiddens, n.states * 
n.classes * sizeof(float)) - else: - # Compute hidden-to-output - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, - n.states, n.classes, n.hiddens, one, - A.hiddens, n.hiddens, 1, - W.hidden_weights, n.hiddens, 1, - one, - A.scores, n.classes, 1) - # Add bias - for i in range(n.states): - VecVec.add_i(&A.scores[i*n.classes], - W.hidden_bias, 1., n.classes) - # Set unseen classes to minimum value - i = 0 - min_ = A.scores[0] - for i in range(1, n.states * n.classes): - if A.scores[i] < min_: - min_ = A.scores[i] - for i in range(n.states): - for j in range(n.classes): - if not W.seen_classes[j]: - A.scores[i*n.classes+j] = min_ - - -cdef void sum_state_features(float* output, - const float* cached, const int* token_ids, int B, int F, int O) nogil: - cdef int idx, b, f, i - cdef const float* feature - padding = cached - cached += F * O - cdef int id_stride = F*O - cdef float one = 1. - for b in range(B): - for f in range(F): - if token_ids[f] < 0: - feature = &padding[f*O] - else: - idx = token_ids[f] * id_stride + f*O - feature = &cached[idx] - blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one, - feature, 1, - &output[b*O], 1) - token_ids += F - - -cdef void cpu_log_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, - int O) nogil: - """Do multi-label log loss""" - cdef double max_, gmax, Z, gZ - best = arg_max_if_gold(scores, costs, is_valid, O) - guess = Vec.arg_max(scores, O) - if best == -1 or guess == -1: - # These shouldn't happen, but if they do, we want to make sure we don't - # cause an OOB access. - return - Z = 1e-10 - gZ = 1e-10 - max_ = scores[guess] - gmax = scores[best] - for i in range(O): - Z += exp(scores[i] - max_) - if costs[i] <= costs[best]: - gZ += exp(scores[i] - gmax) - for i in range(O): - if costs[i] <= costs[best]: - d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ) - else: - d_scores[i] = exp(scores[i]-max_) / Z - - -cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, - const int* is_valid, int n) nogil: - # Find minimum cost - cdef float cost = 1 - for i in range(n): - if is_valid[i] and costs[i] < cost: - cost = costs[i] - # Now find best-scoring with that cost - cdef int best = -1 - for i in range(n): - if costs[i] <= cost and is_valid[i]: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - -cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil: - cdef int best = -1 - for i in range(n): - if is_valid[i] >= 1: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - - -def ParserStepModel( - tokvecs: Floats2d, - bp_tokvecs: Callable, - upper: Model[Floats2d, Floats2d], - dropout: float=0.1, - unseen_classes: Optional[List[int]]=None -) -> Model[Ints2d, Floats2d]: - # TODO: Keep working on replacing all of this with just 'chain' - state2vec = precompute_hiddens( - tokvecs, - bp_tokvecs - ) - class_mask = numpy.zeros((self.nO,), dtype='f') - class_mask.fill(1) - if unseen_classes is not None: - for class_ in unseen_classes: - class_mask[class_] = 0. 
- - return _ParserStepModel( - "ParserStep", - step_forward, - init=None, - dims={"nO": upper.get_dim("nO")}, - layers=[state2vec, upper], - attrs={ - "tokvecs": tokvecs, - "bp_tokvecs": bp_tokvecs, - "dropout_rate": dropout, - "class_mask": class_mask - } - ) - - -class _ParserStepModel(Model): - # TODO: Remove need for all this stuff, so we can normalize this - def class_is_unseen(self, class_): - return self._class_mask[class_] - - def mark_class_unseen(self, class_): - self._class_mask[class_] = 0 - - def mark_class_seen(self, class_): - self._class_mask[class_] = 1 - - def get_token_ids(self, states): - cdef StateClass state - states = [state for state in states if not state.is_final()] - cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), - dtype='i', order='C') - ids.fill(-1) - c_ids = ids.data - for state in states: - state.c.set_context_tokens(c_ids, ids.shape[1]) - c_ids += ids.shape[1] - return ids - - -def step_forward(model: _ParserStepModel, token_ids, is_train): - # TODO: Eventually we hopefully can get rid of all of this? - # If we make the 'class_mask' thing its own layer, we can just - # have chain() here, right? - state2vec, upper = model.layers - vector, get_d_tokvecs = state2vec(token_ids, is_train) - mask = None - vec2scores = ensure_same_device(model.ops, vec2scores) - dropout_rate = model.attrs["dropout_rate"] - if is_train and dropout_rate > 0: - mask = model.ops.get_dropout_mask(vector.shape, dropout_rate) - vector *= mask - scores, get_d_vector = vec2scores(vector, is_train) - # If the class is unseen, make sure its score is minimum - class_mask = model.attrs["class_mask"] - scores[:, class_mask == 0] = model.ops.xp.nanmin(scores) - - def backprop_parser_step(d_scores): - # Zero vectors for unseen classes - d_scores *= model._class_mask - d_vector = get_d_vector(d_scores) - if mask is not None: - d_vector *= mask - return get_d_tokvecs(d_vector) - - return scores, backprop_parser_step - - -def precompute_hiddens(lower_model, feat_weights: Floats3d, bp_hiddens: Callable) -> Model: - return Model( - "precompute_hiddens", - init=None, - forward=_precompute_forward, - dims={ - "nO": feat_weights.shape[2], - "nP": lower_model.get_dim("nP") if lower_model.has_dim("nP") else 1, - "nF": cached.shape[1] - }, - ops=lower_model.ops - ) - - -def _precomputed_forward( - model: Model[Ints2d, Floats2d], - token_ids: Ints2d, - is_train: bool -) -> Tuple[Floats2d, Callable]: - nO = model.get_dim("nO") - nP = model.get_dim("nP") - bp_hiddens = model.attrs["bp_hiddens"] - feat_weights = model.attrs["feat_weights"] - bias = model.attrs["bias"] - hidden = model.ops.alloc2f( - token_ids.shape[0], - nO * nP - ) - # TODO: This is probably wrong, right? 
- model.ops.scatter_add( - hidden, - feat_weights, - token_ids - ) - statevec, mask = model.ops.maxout(hidden.reshape((-1, nO, nP))) - - def backward(d_statevec): - return bp_hiddens( - model.ops.backprop_maxout(d_statevec, mask, nP) - ) - - return statevec, backward From 0279aa036a91fd9c8f8d85661f701ae8d3e7cb51 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:28:57 +0200 Subject: [PATCH 37/74] Delete _precomputable_affine module --- spacy/ml/_precomputable_affine.py | 155 ------------------------------ 1 file changed, 155 deletions(-) delete mode 100644 spacy/ml/_precomputable_affine.py diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py deleted file mode 100644 index f5e5cd8ad..000000000 --- a/spacy/ml/_precomputable_affine.py +++ /dev/null @@ -1,155 +0,0 @@ -from thinc.api import Model, normal_init - - -def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): - model = Model( - "precomputable_affine", - forward, - init=init, - dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, - params={"W": None, "b": None, "pad": None}, - attrs={"dropout_rate": dropout}, - ) - return model - - -def forward(model, X, is_train): - nF = model.get_dim("nF") - nO = model.get_dim("nO") - nP = model.get_dim("nP") - nI = model.get_dim("nI") - W = model.get_param("W") - Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True) - Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) - Yf = model.ops.xp.vstack((model.get_param("pad"), Yf)) - - def backward(dY_ids): - # This backprop is particularly tricky, because we get back a different - # thing from what we put out. We put out an array of shape: - # (nB, nF, nO, nP), and get back: - # (nB, nO, nP) and ids (nB, nF) - # The ids tell us the values of nF, so we would have: - # - # dYf = zeros((nB, nF, nO, nP)) - # for b in range(nB): - # for f in range(nF): - # dYf[b, ids[b, f]] += dY[b] - # - # However, we avoid building that array for efficiency -- and just pass - # in the indices. - dY, ids = dY_ids - assert dY.ndim == 3 - assert dY.shape[1] == nO, dY.shape - assert dY.shape[2] == nP, dY.shape - # nB = dY.shape[0] - model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) - Xf = X[ids] - Xf = Xf.reshape((Xf.shape[0], nF * nI)) - - model.inc_grad("b", dY.sum(axis=0)) - dY = dY.reshape((dY.shape[0], nO * nP)) - - Wopfi = W.transpose((1, 2, 0, 3)) - Wopfi = Wopfi.reshape((nO * nP, nF * nI)) - dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) - - dWopfi = model.ops.gemm(dY, Xf, trans1=True) - dWopfi = dWopfi.reshape((nO, nP, nF, nI)) - # (o, p, f, i) --> (f, o, p, i) - dWopfi = dWopfi.transpose((2, 0, 1, 3)) - model.inc_grad("W", dWopfi) - return dXf.reshape((dXf.shape[0], nF, nI)) - - return Yf, backward - - -def _backprop_precomputable_affine_padding(model, dY, ids): - nB = dY.shape[0] - nF = model.get_dim("nF") - nP = model.get_dim("nP") - nO = model.get_dim("nO") - # Backprop the "padding", used as a filler for missing values. - # Values that are missing are set to -1, and each state vector could - # have multiple missing values. The padding has different values for - # different missing features. 
The gradient of the padding vector is: - # - # for b in range(nB): - # for f in range(nF): - # if ids[b, f] < 0: - # d_pad[f] += dY[b] - # - # Which can be rewritten as: - # - # (ids < 0).T @ dY - mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) - return d_pad.reshape((1, nF, nO, nP)) - - -def init(model, X=None, Y=None): - """This is like the 'layer sequential unit variance', but instead - of taking the actual inputs, we randomly generate whitened data. - - Why's this all so complicated? We have a huge number of inputs, - and the maxout unit makes guessing the dynamics tricky. Instead - we set the maxout weights to values that empirically result in - whitened outputs given whitened inputs. - """ - if model.has_param("W") and model.get_param("W").any(): - return - - nF = model.get_dim("nF") - nO = model.get_dim("nO") - nP = model.get_dim("nP") - nI = model.get_dim("nI") - W = model.ops.alloc4f(nF, nO, nP, nI) - b = model.ops.alloc2f(nO, nP) - pad = model.ops.alloc4f(1, nF, nO, nP) - - ops = model.ops - W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) - pad = normal_init(ops, pad.shape, mean=1.0) - model.set_param("W", W) - model.set_param("b", b) - model.set_param("pad", pad) - - ids = ops.alloc((5000, nF), dtype="f") - ids += ops.xp.random.uniform(0, 1000, ids.shape) - ids = ops.asarray(ids, dtype="i") - tokvecs = ops.alloc((5000, nI), dtype="f") - tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( - tokvecs.shape - ) - - def predict(ids, tokvecs): - # nS ids. nW tokvecs. Exclude the padding array. - hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p) - vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f") - # need nS vectors - hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP)) - model.ops.scatter_add(vectors, ids.flatten(), hiddens) - vectors = vectors.reshape((vectors.shape[0], nO, nP)) - vectors += b - vectors = model.ops.asarray(vectors) - if nP >= 2: - return model.ops.maxout(vectors)[0] - else: - return vectors * (vectors >= 0) - - tol_var = 0.01 - tol_mean = 0.01 - t_max = 10 - W = model.get_param("W").copy() - b = model.get_param("b").copy() - for t_i in range(t_max): - acts1 = predict(ids, tokvecs) - var = model.ops.xp.var(acts1) - mean = model.ops.xp.mean(acts1) - if abs(var - 1.0) >= tol_var: - W /= model.ops.xp.sqrt(var) - model.set_param("W", W) - elif abs(mean) >= tol_mean: - b -= mean - model.set_param("b", b) - else: - break From 71abe2e42dd634d05b6b43564c11114368b5af86 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:50:20 +0200 Subject: [PATCH 38/74] Wire up tb_framework to new parser model --- spacy/ml/tb_framework.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 1e14d239e..ddc283216 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,5 @@ from typing import List, Tuple, Any, Optional -from thinc.api import Ops, Model, normal_init +from thinc.api import Ops, Model, normal_init, chain, list2array, Linear from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d from ..tokens.doc import Doc @@ -20,11 +20,15 @@ def TransitionModel( """Set up a transition-based parsing model, using a maxout hidden layer and a linear output layer. 
""" + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None + tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) + tok2vec_projected.set_dim("nO", hidden_width) + return Model( name="parser_model", forward=forward, init=init, - layers=[tok2vec], + layers=[tok2vec_projected], refs={"tok2vec": tok2vec}, params={ "lower_W": None, # Floats2d W for the hidden layer From 45ca12f07aabd227916b7dbe87ce07c4c7698b79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:50:33 +0200 Subject: [PATCH 39/74] Wire up parser model --- spacy/ml/models/parser.py | 140 +++++++++----------------------------- 1 file changed, 34 insertions(+), 106 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index da53f562e..fd476382f 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,13 +1,15 @@ -from typing import Optional, List -from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops +from typing import Optional, List, Tuple, Any from thinc.types import Floats2d +from thinc.api import Model from ...errors import Errors from ...compat import Literal from ...util import registry -from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel -from ...tokens import Doc +from ...tokens.doc import Doc + +TransitionSystem = Any # TODO +State = Any # TODO @registry.architectures.register("spacy.TransitionBasedParser.v1") @@ -19,7 +21,7 @@ def transition_parser_v1( maxout_pieces: int, use_upper: bool = True, nO: Optional[int] = None, -) -> Model: +) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]: return build_tb_parser_model( tok2vec, state_type, @@ -47,8 +49,26 @@ def transition_parser_v2( extra_state_tokens, hidden_width, maxout_pieces, - use_upper, - nO, + nO=nO, + ) + + +@registry.architectures.register("spacy.TransitionBasedParser.v3") +def transition_parser_v2( + tok2vec: Model[List[Doc], List[Floats2d]], + state_type: Literal["parser", "ner"], + extra_state_tokens: bool, + hidden_width: int, + maxout_pieces: int, + nO: Optional[int] = None, +) -> Model: + return build_tb_parser_model( + tok2vec, + state_type, + extra_state_tokens, + hidden_width, + maxout_pieces, + nO=nO, ) @@ -58,7 +78,6 @@ def build_tb_parser_model( extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, - use_upper: bool, nO: Optional[int] = None, ) -> Model: """ @@ -110,102 +129,11 @@ def build_tb_parser_model( nr_feature_tokens = 6 if extra_state_tokens else 3 else: raise ValueError(Errors.E917.format(value=state_type)) - t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) - tok2vec.set_dim("nO", hidden_width) - lower = _define_lower( - nO=hidden_width if use_upper else nO, - nF=nr_feature_tokens, - nI=tok2vec.get_dim("nO"), - nP=maxout_pieces, + return TransitionModel( + tok2vec=tok2vec, + state_tokens=nr_feature_tokens, + hidden_width=hidden_width, + maxout_pieces=maxout_pieces, + nO=nO, + unseen_classes=set(), ) - upper = None - if use_upper: - with use_ops("numpy"): - # Initialize weights at zero, as it's a classification layer. 
- upper = _define_upper(nO=nO, nI=None) - return TransitionModel(tok2vec, lower, upper, resize_output) - - -def _define_upper(nO, nI): - return Linear(nO=nO, nI=nI, init_W=zero_init) - - -def _define_lower(nO, nF, nI, nP): - return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP) - - -def resize_output(model, new_nO): - if model.attrs["has_upper"]: - return _resize_upper(model, new_nO) - return _resize_lower(model, new_nO) - - -def _resize_upper(model, new_nO): - upper = model.get_ref("upper") - if upper.has_dim("nO") is None: - upper.set_dim("nO", new_nO) - return model - elif new_nO == upper.get_dim("nO"): - return model - - smaller = upper - nI = smaller.maybe_get_dim("nI") - with use_ops("numpy"): - larger = _define_upper(nO=new_nO, nI=nI) - # it could be that the model is not initialized yet, then skip this bit - if smaller.has_param("W"): - larger_W = larger.ops.alloc2f(new_nO, nI) - larger_b = larger.ops.alloc1f(new_nO) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - # Weights are stored in (nr_out, nr_in) format, so we're basically - # just adding rows here. - if smaller.has_dim("nO"): - old_nO = smaller.get_dim("nO") - larger_W[:old_nO] = smaller_W - larger_b[:old_nO] = smaller_b - for i in range(old_nO, new_nO): - model.attrs["unseen_classes"].add(i) - - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) - model._layers[-1] = larger - model.set_ref("upper", larger) - return model - - -def _resize_lower(model, new_nO): - lower = model.get_ref("lower") - if lower.has_dim("nO") is None: - lower.set_dim("nO", new_nO) - return model - - smaller = lower - nI = smaller.maybe_get_dim("nI") - nF = smaller.maybe_get_dim("nF") - nP = smaller.maybe_get_dim("nP") - larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP) - # it could be that the model is not initialized yet, then skip this bit - if smaller.has_param("W"): - larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI) - larger_b = larger.ops.alloc2f(new_nO, nP) - larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - smaller_pad = smaller.get_param("pad") - # Copy the old weights and padding into the new layer - if smaller.has_dim("nO"): - old_nO = smaller.get_dim("nO") - larger_W[:, 0:old_nO, :, :] = smaller_W - larger_pad[:, :, 0:old_nO, :] = smaller_pad - larger_b[0:old_nO, :] = smaller_b - for i in range(old_nO, new_nO): - model.attrs["unseen_classes"].add(i) - - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) - larger.set_param("pad", larger_pad) - model._layers[1] = larger - model.set_ref("lower", larger) - return model From 1921e8681340722287b55dc8558c6315cfa5fc5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:51:14 +0200 Subject: [PATCH 40/74] Uncython ner.pyx and dep_parser.pyx --- spacy/pipeline/{dep_parser.pyx => dep_parser.py} | 0 spacy/pipeline/{ner.pyx => ner.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename spacy/pipeline/{dep_parser.pyx => dep_parser.py} (100%) rename spacy/pipeline/{ner.pyx => ner.py} (100%) diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py similarity index 100% rename from spacy/pipeline/dep_parser.pyx rename to spacy/pipeline/dep_parser.py diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py similarity index 100% rename from spacy/pipeline/ner.pyx rename to spacy/pipeline/ner.py From 9c4a04d0c5dc2246e0703d34c62925f2fee94b01 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 12:51:32 
+0200 Subject: [PATCH 41/74] Uncython --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index a4663d070..3904593dc 100755 --- a/setup.py +++ b/setup.py @@ -31,10 +31,8 @@ MOD_NAMES = [ "spacy.attrs", "spacy.kb", "spacy.morphology", - "spacy.pipeline.dep_parser", "spacy.pipeline.morphologizer", "spacy.pipeline.multitask", - "spacy.pipeline.ner", "spacy.pipeline.pipe", "spacy.pipeline.trainable_pipe", "spacy.pipeline.sentencizer", From 03018904efccc98213130927e03f40517c650fd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 16:11:58 +0200 Subject: [PATCH 42/74] Work on parser model --- spacy/ml/tb_framework.py | 169 +++++++++++++++------------------------ 1 file changed, 65 insertions(+), 104 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index ddc283216..714a4e43e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -21,7 +21,7 @@ def TransitionModel( layer and a linear output layer. """ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) + tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore tok2vec_projected.set_dim("nO", hidden_width) return Model( @@ -47,17 +47,28 @@ def TransitionModel( attrs={ "unseen_classes": set(unseen_classes), "resize_output": resize_output, - "make_step_model": make_step_model, }, ) -def make_step_model(model: Model) -> Model[List[State], Floats2d]: - ... - - -def resize_output(model: Model) -> Model: - ... +def resize_output(model: Model, new_nO: int) -> Model: + old_nO = model.maybe_get_dim("nO") + if old_nO is None: + model.set_dim("nO", new_nO) + return model + elif new_nO <= old_nO: + return model + elif model.has_param("upper_W"): + nH = model.get_dim("nH") + new_W = model.ops.alloc2f(new_nO, nH) + new_b = model.ops.alloc1f(new_nO) + old_W = model.get_param("upper_W") + old_b = model.get_param("upper_b") + new_W[:old_nO] = old_W # type: ignore + new_b[:old_nO] = old_b # type: ignore + for i in range(old_nO, new_nO): + model.attrs["unseen_classes"].add(i) + return model def init( @@ -87,9 +98,9 @@ def init( padl = ops.alloc4f(1, nF, nH, nP) Wu = ops.alloc2f(nO, nH) bu = ops.alloc1f(nO) - Wl = normal_init(ops, Wl.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) - padl = normal_init(ops, padl.shape, mean=1.0) - # TODO: Experiment with whether better to initialize Wu + Wl = normal_init(ops, Wl.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) # type: ignore + padl = normal_init(ops, padl.shape, mean=1.0) # type: ignore + # TODO: Experiment with whether better to initialize upper_W model.set_param("lower_W", Wl) model.set_param("lower_b", bl) model.set_param("lower_pad", padl) @@ -101,11 +112,11 @@ def init( def forward(model, docs_moves, is_train): tok2vec = model.get_ref("tok2vec") - state2scores = model.get_ref("state2scores") - # Get a reference to the parameters. We need to work with - # stable references through the forward/backward pass, to make - # sure we don't have a stale reference if there's concurrent shenanigans. 
- params = {name: model.get_param(name) for name in model.param_names} + lower_pad = model.get_param("lower_pad") + lower_b = model.get_param("lower_b") + upper_W = model.get_param("upper_W") + upper_b = model.get_param("upper_b") + ops = model.ops docs, moves = docs_moves states = moves.init_batch(docs) @@ -113,108 +124,58 @@ def forward(model, docs_moves, is_train): feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) memory = [] all_scores = [] - while states: - states, scores, memory = _step_parser( - ops, params, moves, states, feats, memory, is_train - ) + next_states = list(states) + while next_states: + ids = moves.get_state_ids(states) + preacts = _sum_state_features(feats, lower_pad, ids) + # * Add the bias + preacts += lower_b + # * Apply the activation (maxout) + statevecs, which = ops.maxout(preacts) + # * Multiply the state-vector by the scores weights + scores = ops.gemm(statevecs, upper_W, trans2=True) + # * Add the bias + scores += upper_b + next_states = moves.transition_states(states, scores) all_scores.append(scores) + if is_train: + memory.append((ids, statevecs, which)) def backprop_parser(d_states_d_scores): _, d_scores = d_states_d_scores - d_feats, ids = _backprop_parser_steps(ops, params, memory, d_scores) - d_tokvecs = backprop_feats((d_feats, ids)) - return backprop_tok2vec(d_tokvecs), None + ids, statevecs, whiches = [ops.xp.concatenate(*item) for item in zip(*memory)] + # TODO: Unseen class masking + # Calculate the gradients for the parameters of the upper layer. + model.inc_grad("upper_b", d_scores.sum(axis=0)) + model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) + # Now calculate d_statevecs, by backproping through the upper linear layer. + d_statevecs = model.ops.gemm(d_scores, upper_W) + # Backprop through the maxout activation + d_preacts = model.ops.backprop_maxount( + d_statevecs, whiches, model.get_dim("nP") + ) + # We don't need to backprop the summation, because we pass back the IDs instead + d_tokvecs = backprop_feats((d_preacts, ids)) + return (backprop_tok2vec(d_tokvecs), None) return (states, all_scores), backprop_parser -def _step_parser(ops, params, moves, states, feats, memory, is_train): - ids = moves.get_state_ids(states) - statevecs, which, scores = _score_ids(ops, params, ids, feats, is_train) - next_states = moves.transition_states(states, scores) - if is_train: - memory.append((ids, statevecs, which)) - return next_states, scores, memory - - -def _score_ids(ops, params, ids, feats, is_train): - lower_pad = params["lower_pad"] - lower_b = params["lower_b"] - upper_W = params["upper_W"] - upper_b = params["upper_b"] - # During each step of the parser, we do: - # * Index into the features, to get the pre-activated vector - # for each (token, feature) and sum the feature vectors - preacts = _sum_state_features(feats, lower_pad, ids) - # * Add the bias - preacts += lower_b - # * Apply the activation (maxout) - statevecs, which = ops.maxout(preacts) - # * Multiply the state-vector by the scores weights - scores = ops.gemm(statevecs, upper_W, trans2=True) - # * Add the bias - scores += upper_b - # * Apply the is-class-unseen masking - # TODO - return statevecs, which, scores - - -def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d) -> Floats2d: +def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> Floats2d: # Here's what we're trying to implement here: # # for i in range(ids.shape[0]): # for j in range(ids.shape[1]): # output[i] += feats[ids[i, j], j] # - 
# Reshape the feats into 2d, to make indexing easier. Instead of getting an - # array of indices where the cell at (4, 2) needs to refer to the row at - # feats[4, 2], we'll translate the index so that it directly addresses - # feats[18]. This lets us make the indices array 1d, leading to fewer - # numpy shennanigans. - feats2d = ops.reshape2f(feats, feats.shape[0] * feats.shape[1], feats.shape[2]) - # Now translate the ids. If we're looking for the row that used to be at - # (4, 1) and we have 4 features, we'll find it at (4*4)+1=17. - oob_ids = ids < 0 # Retain the -1 values - ids = ids * feats.shape[1] + ops.xp.arange(feats.shape[1]) - ids[oob_ids] = -1 - unsummed2d = feats2d[ops.reshape1i(ids, ids.size)] - unsummed3d = ops.reshape3f( - unsummed2d, feats.shape[0], feats.shape[1], feats.shape[2] - ) - summed = unsummed3d.sum(axis=1) # type: ignore - return summed - - -def _process_memory(ops, memory): - """Concatenate the memory buffers from each state into contiguous - buffers for the whole batch. - """ - return [ops.xp.concatenate(*item) for item in zip(*memory)] - - -def _backprop_parser_steps(model, upper_W, memory, d_scores): - # During each step of the parser, we do: - # * Index into the features, to get the pre-activated vector - # for each (token, feature) - # * Sum the feature vectors - # * Add the bias - # * Apply the activation (maxout) - # * Multiply the state-vector by the scores weights - # * Add the bias - # * Apply the is-class-unseen masking - # - # So we have to backprop through all those steps. - ids, statevecs, whiches = _process_memory(model.ops, memory) - # TODO: Unseen class masking - # Calculate the gradients for the parameters of the upper layer. - model.inc_grad("upper_b", d_scores.sum(axis=0)) - model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) - # Now calculate d_statevecs, by backproping through the upper linear layer. - d_statevecs = model.ops.gemm(d_scores, upper_W) - # Backprop through the maxout activation - d_preacts = model.ops.backprop_maxount(d_statevecs, whiches, model.get_dim("nP")) - # We don't need to backprop the summation, because we pass back the IDs instead - return d_preacts, ids + # The arange thingy here is highly weird to me, but apparently + # it's how it works. If you squint a bit at the loop above I guess + # it makes sense? 
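# Editor's illustrative sketch, not part of the patch: a tiny standalone
# check that the fancy indexing used here really is the double loop from the
# comment above. All shapes are invented for the example: 10 cached feature
# rows, nF=3 context tokens, nH*nP=4 values per row.
import numpy

example_feats = numpy.random.rand(10, 3, 4).astype("f")   # (rows, nF, nH*nP)
example_ids = numpy.random.randint(0, 10, size=(5, 3))    # (batch, nF)
looped = numpy.zeros((5, 4), dtype="f")
for b in range(example_ids.shape[0]):
    for f in range(example_ids.shape[1]):
        looped[b] += example_feats[example_ids[b, f], f]
# Pairing ids with arange(nF) selects feats[ids[b, f], f] for every (b, f),
# giving a (batch, nF, nH*nP) array; summing over axis 1 is the loop above.
vectorised = example_feats[example_ids, numpy.arange(3)].sum(axis=1)
assert numpy.allclose(looped, vectorised)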
+ if not _arange: + _arange.append(ops.xp.arange(ids.shape[1])) + if _arange[0].size != ids.shape[1]: + _arange[0] = ops.xp.arange(ids.shape[1]) + return feats[ids, _arange[0]].sum(axis=1) # type: ignore def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): From 4b5d1b53f65980e090e283fdd9db9b38ee8bd0fd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 22:21:17 +0200 Subject: [PATCH 43/74] Support unseen_classes in parser model --- spacy/ml/tb_framework.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 714a4e43e..9cb93c9a2 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -122,38 +122,46 @@ def forward(model, docs_moves, is_train): states = moves.init_batch(docs) tokvecs, backprop_tok2vec = tok2vec(docs, is_train) feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) - memory = [] + all_ids = [] + all_which = [] + all_statevecs = [] all_scores = [] next_states = list(states) + unseen_mask = _get_unseen_mask(model) while next_states: ids = moves.get_state_ids(states) + # Sum the state features, add the bias and apply the activation (maxout) + # to create the state vectors. preacts = _sum_state_features(feats, lower_pad, ids) - # * Add the bias preacts += lower_b - # * Apply the activation (maxout) statevecs, which = ops.maxout(preacts) - # * Multiply the state-vector by the scores weights + # Multiply the state-vector by the scores weights and add the bias, + # to get the logits. scores = ops.gemm(statevecs, upper_W, trans2=True) - # * Add the bias scores += upper_b + scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) + # Transition the states, filtering out any that are finished. next_states = moves.transition_states(states, scores) all_scores.append(scores) if is_train: - memory.append((ids, statevecs, which)) + # Remember intermediate results for the backprop. + all_ids.append(ids) + all_statevecs.append(statevecs) + all_which.append(which) def backprop_parser(d_states_d_scores): _, d_scores = d_states_d_scores - ids, statevecs, whiches = [ops.xp.concatenate(*item) for item in zip(*memory)] - # TODO: Unseen class masking + d_scores *= unseen_mask + ids = ops.xp.concatenate(all_ids) + statevecs = ops.xp.concatenate(all_statevecs) + which = ops.xp.concatenate(all_which) # Calculate the gradients for the parameters of the upper layer. model.inc_grad("upper_b", d_scores.sum(axis=0)) model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) # Now calculate d_statevecs, by backproping through the upper linear layer. 
d_statevecs = model.ops.gemm(d_scores, upper_W) # Backprop through the maxout activation - d_preacts = model.ops.backprop_maxount( - d_statevecs, whiches, model.get_dim("nP") - ) + d_preacts = model.ops.backprop_maxount(d_statevecs, which, model.get_dim("nP")) # We don't need to backprop the summation, because we pass back the IDs instead d_tokvecs = backprop_feats((d_preacts, ids)) return (backprop_tok2vec(d_tokvecs), None) @@ -161,6 +169,14 @@ def forward(model, docs_moves, is_train): return (states, all_scores), backprop_parser +def _get_unseen_mask(model: Model) -> Floats1d: + mask = model.ops.alloc1f(model.get_dim("nO")) + mask.fill(1) + for class_ in model.attrs.get("unseen_classes", set()): + mask[class_] = 0 + return mask + + def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> Floats2d: # Here's what we're trying to implement here: # From 07a3581ff85a6992a2d802501c809674b145ee27 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 22:26:52 +0200 Subject: [PATCH 44/74] Support unseen classes in parser --- spacy/pipeline/transition_parser.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index c86a32a12..1bf2140ab 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -267,6 +267,12 @@ class Parser(TrainablePipe): gZ = exp_gscores.sum(axis=1, keepdims=True) d_scores = exp_scores / Z d_scores[is_gold] -= exp_gscores / gZ + if "unseen_classes" in model.attrs: + for i in range(costs.shape[0]): + for clas in range(costs.shape[1]): + if costs[i, clas] <= best_costs[i, 0]: + if clas in model.attrs["unseen_classes"]: + model.attrs["unseen_classes"].remove(clas) return d_scores def _get_costs_from_histories(self, examples, histories): From d765a4f8ee81d4dacb41044344f35a5ed5972e05 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Oct 2021 22:34:29 +0200 Subject: [PATCH 45/74] Cleaner handling of unseen classes --- spacy/ml/tb_framework.py | 7 +++++++ spacy/pipeline/transition_parser.pyx | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 9cb93c9a2..006d5a384 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -151,6 +151,13 @@ def forward(model, docs_moves, is_train): def backprop_parser(d_states_d_scores): _, d_scores = d_states_d_scores + if model.attrs.get("unseen_classes"): + # If we have a negative gradient (i.e. the probability should + # increase) on any classes we filtered out as unseen, mark + # them as seen. 
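# Editor's illustrative sketch, not part of the patch: the unseen-class
# bookkeeping in plain numpy, with invented scores and gradients. Classes in
# `unseen` have never appeared as gold actions: their scores are clamped to
# the minimum on the forward pass, and they only leave the set once the
# gradient says their probability should have been higher.
import numpy

unseen = {2, 3}
mask = numpy.ones((4,), dtype="f")
mask[list(unseen)] = 0.0

scores = numpy.array([[1.0, 2.0, 5.0, -1.0]], dtype="f")
scores[:, mask == 0] = scores.min()              # clamp still-unseen classes

d_scores = numpy.array([[0.3, -0.1, -0.2, 0.4]], dtype="f")
for clas in set(unseen):                         # iterate over a copy
    if (d_scores[:, clas] < 0).any():            # class 2 wanted a higher score
        unseen.remove(clas)                      # so treat it as seen from now on
d_scores *= mask                                 # zero gradients for unseen classes
assert unseen == {3}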
+ for clas in set(model.attrs["unseen_classes"]): + if (d_scores[:, clas] < 0).any(): + model.attrs["unseen_classes"].remove(clas) d_scores *= unseen_mask ids = ops.xp.concatenate(all_ids) statevecs = ops.xp.concatenate(all_statevecs) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 1bf2140ab..c86a32a12 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -267,12 +267,6 @@ class Parser(TrainablePipe): gZ = exp_gscores.sum(axis=1, keepdims=True) d_scores = exp_scores / Z d_scores[is_gold] -= exp_gscores / gZ - if "unseen_classes" in model.attrs: - for i in range(costs.shape[0]): - for clas in range(costs.shape[1]): - if costs[i, clas] <= best_costs[i, 0]: - if clas in model.attrs["unseen_classes"]: - model.attrs["unseen_classes"].remove(clas) return d_scores def _get_costs_from_histories(self, examples, histories): From c538eaf1c8137fc6fcd076c9272b020d5558ae56 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 Oct 2021 01:21:51 +0200 Subject: [PATCH 46/74] Work through tests --- spacy/ml/tb_framework.py | 44 +++++++++++-------- .../pipeline/_parser_internals/stateclass.pyx | 3 ++ .../_parser_internals/transition_system.pyx | 39 ++++++++++++++++ spacy/pipeline/transition_parser.pyx | 9 ++-- spacy/tests/test_misc.py | 2 - 5 files changed, 72 insertions(+), 25 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 006d5a384..35549c373 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,6 +1,7 @@ from typing import List, Tuple, Any, Optional from thinc.api import Ops, Model, normal_init, chain, list2array, Linear from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d +import numpy from ..tokens.doc import Doc @@ -29,7 +30,7 @@ def TransitionModel( forward=forward, init=init, layers=[tok2vec_projected], - refs={"tok2vec": tok2vec}, + refs={"tok2vec": tok2vec_projected}, params={ "lower_W": None, # Floats2d W for the hidden layer "lower_b": None, # Floats1d bias for the hidden layer @@ -77,8 +78,10 @@ def init( Y: Optional[Tuple[List[State], List[Floats2d]]] = None, ): if X is not None: - docs, states = X + docs, moves = X model.get_ref("tok2vec").initialize(X=docs) + else: + model.get_ref("tok2vec").initialize() inferred_nO = _infer_nO(Y) if inferred_nO is not None: current_nO = model.maybe_get_dim("nO") @@ -110,7 +113,8 @@ def init( _lsuv_init(model) -def forward(model, docs_moves, is_train): +def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool): + nF = model.get_dim("nF") tok2vec = model.get_ref("tok2vec") lower_pad = model.get_param("lower_pad") lower_b = model.get_param("lower_b") @@ -126,13 +130,16 @@ def forward(model, docs_moves, is_train): all_which = [] all_statevecs = [] all_scores = [] - next_states = list(states) + next_states = [s for s in states if not s.is_final()] unseen_mask = _get_unseen_mask(model) + ids = numpy.zeros((len(states), nF), dtype="i") while next_states: - ids = moves.get_state_ids(states) + ids = ids[: len(next_states)] + for i, state in enumerate(next_states): + state.set_context_tokens(ids, i, nF) # Sum the state features, add the bias and apply the activation (maxout) # to create the state vectors. 
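# Editor's illustrative sketch, not part of the patch: what the maxout step
# below does to the summed, biased features. Every hidden unit keeps the best
# of its nP "pieces", and `which` records the winning piece so the backward
# pass can route gradients. Sizes are invented: batch=2, nH=3, nP=2.
import numpy

example_preacts = numpy.random.rand(2, 3, 2).astype("f")   # (batch, nH, nP)
which = example_preacts.argmax(axis=-1)                    # winning piece per unit
statevecs = example_preacts.max(axis=-1)                   # (batch, nH) state vectors

# The backward pass sends each d_statevecs entry only to its winning piece;
# the other pieces get zero gradient (this is what backprop_maxout computes).
d_statevecs = numpy.ones_like(statevecs)
d_preacts = numpy.zeros_like(example_preacts)
for b in range(2):
    for h in range(3):
        d_preacts[b, h, which[b, h]] = d_statevecs[b, h]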
- preacts = _sum_state_features(feats, lower_pad, ids) + preacts = _sum_state_features(ops, feats, ids) preacts += lower_b statevecs, which = ops.maxout(preacts) # Multiply the state-vector by the scores weights and add the bias, @@ -141,7 +148,7 @@ def forward(model, docs_moves, is_train): scores += upper_b scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) # Transition the states, filtering out any that are finished. - next_states = moves.transition_states(states, scores) + next_states = moves.transition_states(next_states, scores) all_scores.append(scores) if is_train: # Remember intermediate results for the backprop. @@ -204,24 +211,23 @@ def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> F def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): W: Floats4d = model.get_param("lower_W") - b: Floats2d = model.get_param("lower_b") pad: Floats4d = model.get_param("lower_pad") nF = model.get_dim("nF") - nO = model.get_dim("nO") + nH = model.get_dim("nH") nP = model.get_dim("nP") nI = model.get_dim("nI") - Yf_ = model.ops.gemm(X, model.ops.reshape2f(W, nF * nO * nP, nI), trans2=True) - Yf = model.ops.reshape4f(Yf_, Yf_.shape[0], nF, nO, nP) + Yf_ = model.ops.gemm(X, model.ops.reshape2f(W, nF * nH * nP, nI), trans2=True) + Yf = model.ops.reshape4f(Yf_, Yf_.shape[0], nF, nH, nP) Yf = model.ops.xp.vstack((Yf, pad)) def backward(dY_ids: Tuple[Floats3d, Ints2d]): # This backprop is particularly tricky, because we get back a different # thing from what we put out. We put out an array of shape: - # (nB, nF, nO, nP), and get back: - # (nB, nO, nP) and ids (nB, nF) + # (nB, nF, nH, nP), and get back: + # (nB, nH, nP) and ids (nB, nF) # The ids tell us the values of nF, so we would have: # - # dYf = zeros((nB, nF, nO, nP)) + # dYf = zeros((nB, nF, nH, nP)) # for b in range(nB): # for f in range(nF): # dYf[b, ids[b, f]] += dY[b] @@ -230,7 +236,7 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): # in the indices. 
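# Editor's illustrative sketch, not part of the patch: why passing the ids back
# is enough to compute the weight gradient without ever materialising the
# (nB, nF, nH, nP) array described above. Gathering X by ids restores the
# per-feature structure. All sizes are invented and the -1 padding ids are
# ignored for brevity; the einsum mirrors the gemm/reshape/transpose below.
import numpy

nB, nF, nH, nP, nI = 4, 3, 2, 2, 5
X = numpy.random.rand(6, nI)                    # token vectors
ids = numpy.random.randint(0, 6, (nB, nF))      # which token fed each feature slot
dY = numpy.random.rand(nB, nH, nP)              # gradient of the summed features

# Naive version: loop over states and feature slots.
dW_loop = numpy.zeros((nF, nH, nP, nI))
for b in range(nB):
    for f in range(nF):
        dW_loop[f] += dY[b][..., None] * X[ids[b, f]]

# Vectorised version: gather the rows once, contract over the batch in one go.
Xf = X[ids]                                     # (nB, nF, nI)
dW_vec = numpy.einsum("bhp,bfi->fhpi", dY, Xf)
assert numpy.allclose(dW_loop, dW_vec)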
dY, ids = dY_ids assert dY.ndim == 3 - assert dY.shape[1] == nO, dY.shape + assert dY.shape[1] == nH, dY.shape assert dY.shape[2] == nP, dY.shape # nB = dY.shape[0] model.inc_grad( @@ -239,14 +245,14 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): Xf = model.ops.reshape2f(X[ids], ids.shape[0], nF * nI) model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore - dY = model.ops.reshape2f(dY, dY.shape[0], nO * nP) + dY = model.ops.reshape2f(dY, dY.shape[0], nH * nP) Wopfi = W.transpose((1, 2, 0, 3)) - Wopfi = Wopfi.reshape((nO * nP, nF * nI)) - dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) + Wopfi = Wopfi.reshape((nH * nP, nF * nI)) + dXf = model.ops.gemm(dY.reshape((dY.shape[0], nH * nP)), Wopfi) dWopfi = model.ops.gemm(dY, Xf, trans1=True) - dWopfi = dWopfi.reshape((nO, nP, nF, nI)) + dWopfi = dWopfi.reshape((nH, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) dWopfi = dWopfi.transpose((2, 0, 1, 3)) model.inc_grad("W", dWopfi) diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index 208cf061e..dbd22117e 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -180,3 +180,6 @@ cdef class StateClass: def clone(self, StateClass src): self.c.clone(src.c) + + def set_context_tokens(self, int[:, :] output, int row, int n_feats): + self.c.set_context_tokens(&output[row, 0], n_feats) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 181cffd8d..79eceb9ff 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,6 +1,8 @@ # cython: infer_types=True from __future__ import print_function from cymem.cymem cimport Pool +from libc.stdlib cimport calloc, free +from libcpp.vector cimport vector from collections import Counter import srsly @@ -141,6 +143,16 @@ cdef class TransitionSystem: action.do(state.c, action.label) state.c.history.push_back(action.clas) + def transition_states(self, states, float[:, ::1] scores): + assert len(states) == scores.shape[0] + cdef StateClass state + cdef float* c_scores = &scores[0, 0] + cdef vector[StateC*] c_states + for state in states: + c_states.push_back(state.c) + c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0]) + return [state for state in states if not state.c.is_final()] + cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError @@ -250,3 +262,30 @@ cdef class TransitionSystem: msg = util.from_bytes(bytes_data, deserializers, exclude) self.initialize_actions(labels) return self + + +cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores, + int nr_class, int batch_size) nogil: + is_valid = calloc(moves.n_moves, sizeof(int)) + cdef int i, guess + cdef Transition action + for i in range(batch_size): + moves.set_valid(is_valid, states[i]) + guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class) + if guess == -1: + # This shouldn't happen, but it's hard to raise an error here, + # and we don't want to infinite loop. So, force to end state. 
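# Editor's illustrative sketch, not part of the patch: the greedy, validity-
# masked action choice in plain Python, with invented scores. The real code
# gets the scores from the model and the validity mask from the transition
# system; a state with no valid move is forced to its final configuration so
# the batch loop cannot spin forever.
def arg_max_if_valid_py(scores, is_valid):
    best = -1
    for i, valid in enumerate(is_valid):
        if valid and (best == -1 or scores[i] > scores[best]):
            best = i
    return best

example_scores = [0.2, 1.7, -0.3, 0.9]
example_is_valid = [0, 0, 1, 1]      # say only two moves apply in this state
guess = arg_max_if_valid_py(example_scores, example_is_valid)
assert guess == 3                    # the highest-scoring *valid* move
# guess == -1 would mean no move is valid; that is the force_final() case.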
+ states[i].force_final() + else: + action = moves.c[guess] + action.do(states[i], action.label) + free(is_valid) + + +cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil: + cdef int best = -1 + for i in range(n): + if is_valid[i] >= 1: + if best == -1 or scores[i] > scores[best]: + best = i + return best diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index c86a32a12..8d2f25fa0 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -92,8 +92,9 @@ class Parser(TrainablePipe): @property def move_names(self): names = [] + cdef TransitionSystem moves = self.moves for i in range(self.moves.n_moves): - name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) + name = self.moves.move_name(moves.c[i].move, moves.c[i].label) # Explicitly removing the internal "U-" token used for blocking entities if name != "U-": names.append(name) @@ -273,14 +274,14 @@ class Parser(TrainablePipe): cdef TransitionSystem moves = self.moves cdef StateClass state cdef int clas - cdef int nF = self.model.state2vec.nF + cdef int nF = self.model.get_dim("nF") cdef int nO = moves.n_moves cdef int nS = sum([len(history) for history in histories]) cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f") cdef Pool mem = Pool() is_valid = mem.alloc(nO, sizeof(int)) c_costs = costs.data - states = moves.init_states([eg.x for eg in examples]) + states = moves.init_batch([eg.x for eg in examples]) cdef int i = 0 for eg, state, history in zip(examples, states, histories): gold = moves.init_gold(state, eg) @@ -342,7 +343,7 @@ class Parser(TrainablePipe): for example in islice(get_examples(), 10): doc_sample.append(example.predicted) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) - self.model.initialize(doc_sample) + self.model.initialize((doc_sample, self.moves)) if nlp is not None: self.init_multitask_objectives(get_examples, nlp.pipeline) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bdb2b9752..125adbd37 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -5,8 +5,6 @@ from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu, require_cpu -from spacy.ml._precomputable_affine import PrecomputableAffine -from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding from spacy.util import dot_to_object, SimpleFrozenList from thinc.api import Config, Optimizer, ConfigValidationError from spacy.training.batchers import minibatch_by_words From b67dd0cf8965af48b23b9722ce24b7610eb86f85 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Oct 2021 17:10:33 +0200 Subject: [PATCH 47/74] Keep working through errors --- spacy/ml/tb_framework.py | 30 +++++-------------- .../_parser_internals/transition_system.pyx | 1 + spacy/pipeline/transition_parser.pyx | 14 +++++---- 3 files changed, 17 insertions(+), 28 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 35549c373..1846c4d1e 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -133,13 +133,14 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo next_states = [s for s in states if not s.is_final()] unseen_mask = _get_unseen_mask(model) ids = numpy.zeros((len(states), nF), dtype="i") + arange = model.ops.xp.arange(nF) while next_states: ids = ids[: len(next_states)] for i, state in enumerate(next_states): 
state.set_context_tokens(ids, i, nF) # Sum the state features, add the bias and apply the activation (maxout) # to create the state vectors. - preacts = _sum_state_features(ops, feats, ids) + preacts = feats[ids, arange].sum(axis=1) # type: ignore preacts += lower_b statevecs, which = ops.maxout(preacts) # Multiply the state-vector by the scores weights and add the bias, @@ -152,7 +153,7 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo all_scores.append(scores) if is_train: # Remember intermediate results for the backprop. - all_ids.append(ids) + all_ids.append(ids.copy()) all_statevecs.append(statevecs) all_which.append(which) @@ -175,7 +176,7 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo # Now calculate d_statevecs, by backproping through the upper linear layer. d_statevecs = model.ops.gemm(d_scores, upper_W) # Backprop through the maxout activation - d_preacts = model.ops.backprop_maxount(d_statevecs, which, model.get_dim("nP")) + d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) # We don't need to backprop the summation, because we pass back the IDs instead d_tokvecs = backprop_feats((d_preacts, ids)) return (backprop_tok2vec(d_tokvecs), None) @@ -191,23 +192,6 @@ def _get_unseen_mask(model: Model) -> Floats1d: return mask -def _sum_state_features(ops: Ops, feats: Floats3d, ids: Ints2d, _arange=[]) -> Floats2d: - # Here's what we're trying to implement here: - # - # for i in range(ids.shape[0]): - # for j in range(ids.shape[1]): - # output[i] += feats[ids[i, j], j] - # - # The arange thingy here is highly weird to me, but apparently - # it's how it works. If you squint a bit at the loop above I guess - # it makes sense? - if not _arange: - _arange.append(ops.xp.arange(ids.shape[1])) - if _arange[0].size != ids.shape[1]: - _arange[0] = ops.xp.arange(ids.shape[1]) - return feats[ids, _arange[0]].sum(axis=1) # type: ignore - - def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): W: Floats4d = model.get_param("lower_W") @@ -265,7 +249,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): nB = dY.shape[0] nF = model.get_dim("nF") nP = model.get_dim("nP") - nO = model.get_dim("nO") + nH = model.get_dim("nH") # Backprop the "padding", used as a filler for missing values. # Values that are missing are set to -1, and each state vector could # have multiple missing values. 
The padding has different values for @@ -280,8 +264,8 @@ def _backprop_precomputable_affine_padding(model, dY, ids): # # (ids < 0).T @ dY mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) - return d_pad.reshape((1, nF, nO, nP)) + d_pad = model.ops.gemm(mask, dY.reshape(nB, nH * nP), trans1=True) + return d_pad.reshape((1, nF, nH, nP)) def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]: diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 79eceb9ff..7632a1993 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -279,6 +279,7 @@ cdef void c_transition_batch(TransitionSystem moves, StateC** states, const floa else: action = moves.c[guess] action.do(states[i], action.label) + states[i].history.push_back(guess) free(is_valid) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 8d2f25fa0..d9135b5d4 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -239,8 +239,10 @@ class Parser(TrainablePipe): set_dropout_rate(self.model, drop) docs = [eg.x for eg in examples] (states, scores), backprop_scores = self.model.begin_update((docs, self.moves)) + if sum(s.shape[0] for s in scores) == 0: + return losses d_scores = self.get_loss((states, scores), examples) - backprop_scores(d_scores) + backprop_scores((states, d_scores)) if sgd not in (None, False): self.finish_update(sgd) losses[self.name] += (d_scores**2).sum() @@ -252,22 +254,24 @@ class Parser(TrainablePipe): def get_loss(self, states_scores, examples): states, scores = states_scores + scores = self.model.ops.xp.vstack(scores) costs = self._get_costs_from_histories( examples, [list(state.history) for state in states] ) xp = get_array_module(scores) best_costs = costs.min(axis=1, keepdims=True) - is_gold = costs <= costs.min(axis=1, keepdims=True) - gscores = scores[is_gold] - max_ = scores.max(axis=1) + gscores = scores.copy() + min_score = scores.min() + gscores[costs > best_costs] = min_score + max_ = scores.max(axis=1, keepdims=True) gmax = gscores.max(axis=1, keepdims=True) exp_scores = xp.exp(scores - max_) exp_gscores = xp.exp(gscores - gmax) Z = exp_scores.sum(axis=1, keepdims=True) gZ = exp_gscores.sum(axis=1, keepdims=True) d_scores = exp_scores / Z - d_scores[is_gold] -= exp_gscores / gZ + d_scores -= (costs <= best_costs) * (exp_gscores / gZ) return d_scores def _get_costs_from_histories(self, examples, histories): From af9a30b1927116b568928e31011d06a5ff3c34c7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Oct 2021 17:13:11 +0200 Subject: [PATCH 48/74] Keep working through errors --- spacy/ml/tb_framework.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 1846c4d1e..906884e87 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -226,6 +226,10 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): model.inc_grad( "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) ) + print("X", X.shape) + print("ids", ids.shape) + print("dims", "nF", "nI") + print("X[ids]", X[ids].shape) Xf = model.ops.reshape2f(X[ids], ids.shape[0], nF * nI) model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore From 880182afdbaf9c85e33238d31ab862656c9cf00f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal 
Date: Wed, 27 Oct 2021 23:02:29 +0200 Subject: [PATCH 49/74] Work on parser. 15 tests failing --- spacy/ml/tb_framework.py | 30 ++++++----- spacy/pipeline/transition_parser.pyx | 1 + .../tests/serialize/test_serialize_config.py | 52 +++++-------------- 3 files changed, 30 insertions(+), 53 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 906884e87..207f4bd5d 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -42,7 +42,7 @@ def TransitionModel( "nO": None, # Output size "nP": maxout_pieces, "nH": hidden_width, - "nI": tok2vec.maybe_get_dim("nO"), + "nI": tok2vec_projected.maybe_get_dim("nO"), "nF": state_tokens, }, attrs={ @@ -69,6 +69,9 @@ def resize_output(model: Model, new_nO: int) -> Model: new_b[:old_nO] = old_b # type: ignore for i in range(old_nO, new_nO): model.attrs["unseen_classes"].add(i) + model.set_param("upper_W", new_W) + model.set_param("upper_b", new_b) + model.set_dim("nO", new_nO, force=True) return model @@ -167,9 +170,8 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo if (d_scores[:, clas] < 0).any(): model.attrs["unseen_classes"].remove(clas) d_scores *= unseen_mask - ids = ops.xp.concatenate(all_ids) - statevecs = ops.xp.concatenate(all_statevecs) - which = ops.xp.concatenate(all_which) + statevecs = ops.xp.vstack(all_statevecs) + which = ops.xp.vstack(all_which) # Calculate the gradients for the parameters of the upper layer. model.inc_grad("upper_b", d_scores.sum(axis=0)) model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) @@ -178,8 +180,12 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo # Backprop through the maxout activation d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) # We don't need to backprop the summation, because we pass back the IDs instead - d_tokvecs = backprop_feats((d_preacts, ids)) - return (backprop_tok2vec(d_tokvecs), None) + d_state_features = backprop_feats((d_preacts, all_ids)) + ids1d = model.ops.xp.vstack(all_ids).flatten() + d_state_features = d_state_features.reshape((ids1d.size, -1)) + d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) + model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) + return (backprop_tok2vec(d_tokvecs[:-1]), None) return (states, all_scores), backprop_parser @@ -200,6 +206,7 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): nH = model.get_dim("nH") nP = model.get_dim("nP") nI = model.get_dim("nI") + assert X.shape == (X.shape[0], nI), X.shape Yf_ = model.ops.gemm(X, model.ops.reshape2f(W, nF * nH * nP, nI), trans2=True) Yf = model.ops.reshape4f(Yf_, Yf_.shape[0], nF, nH, nP) Yf = model.ops.xp.vstack((Yf, pad)) @@ -226,19 +233,13 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): model.inc_grad( "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) ) - print("X", X.shape) - print("ids", ids.shape) - print("dims", "nF", "nI") - print("X[ids]", X[ids].shape) - Xf = model.ops.reshape2f(X[ids], ids.shape[0], nF * nI) - model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore dY = model.ops.reshape2f(dY, dY.shape[0], nH * nP) - Wopfi = W.transpose((1, 2, 0, 3)) Wopfi = Wopfi.reshape((nH * nP, nF * nI)) dXf = model.ops.gemm(dY.reshape((dY.shape[0], nH * nP)), Wopfi) - + ids1d = model.ops.xp.vstack(ids).flatten() + Xf = model.ops.reshape2f(X[ids1d], -1, nF * nI) dWopfi = model.ops.gemm(dY, Xf, trans1=True) dWopfi = dWopfi.reshape((nH, nP, nF, nI)) # (o, p, f, 
i) --> (f, o, p, i) @@ -250,6 +251,7 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): def _backprop_precomputable_affine_padding(model, dY, ids): + ids = model.ops.xp.vstack(ids) nB = dY.shape[0] nF = model.get_dim("nF") nP = model.get_dim("nP") diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index d9135b5d4..047805239 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -263,6 +263,7 @@ class Parser(TrainablePipe): best_costs = costs.min(axis=1, keepdims=True) gscores = scores.copy() min_score = scores.min() + assert costs.shape == scores.shape, (costs.shape, scores.shape) gscores[costs > best_costs] = min_score max_ = scores.max(axis=1, keepdims=True) gmax = gscores.max(axis=1, keepdims=True) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 6709defb8..ef650d7cd 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -120,33 +120,11 @@ width = ${components.tok2vec.model.width} parser_config_string_upper = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 -use_upper = true - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 333 -depth = 4 -embed_size = 5555 -window_size = 1 -maxout_pieces = 7 -subword_features = false -""" - - -parser_config_string_no_upper = """ -[model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 66 -maxout_pieces = 2 -use_upper = false [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -177,7 +155,6 @@ def my_parser(): extra_state_tokens=True, hidden_width=65, maxout_pieces=5, - use_upper=True, ) return parser @@ -264,15 +241,14 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - model.get_ref("tok2vec") - # check that we have the correct settings, not the default ones - assert model.get_ref("upper").get_dim("nI") == 65 - assert model.get_ref("lower").get_dim("nI") == 65 + assert model.get_ref("tok2vec") is not None + assert model.has_param("lower_W") + assert model.has_param("upper_W") + assert model.has_param("lower_b") + assert model.has_param("upper_b") -@pytest.mark.parametrize( - "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper] -) +@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper]) def test_serialize_parser(parser_config_string): """ Create a non-default parser config to check nlp serializes it correctly """ nlp = English() @@ -285,11 +261,11 @@ def test_serialize_parser(parser_config_string): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - model.get_ref("tok2vec") - # check that we have the correct settings, not the default ones - if model.attrs["has_upper"]: - assert model.get_ref("upper").get_dim("nI") == 66 - assert model.get_ref("lower").get_dim("nI") == 66 + assert model.get_ref("tok2vec") is not None + assert model.has_param("lower_W") + assert model.has_param("upper_W") + assert model.has_param("lower_b") + assert model.has_param("upper_b") def test_config_nlp_roundtrip(): @@ -436,9 +412,7 @@ def test_config_auto_fill_extra_fields(): load_model_from_config(nlp.config) -@pytest.mark.parametrize( - 
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper] -) +@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper]) def test_config_validate_literal(parser_config_string): nlp = English() config = Config().from_str(parser_config_string) From 7309e49286dc780e7d33dc46a96a820a843749eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Oct 2021 23:21:55 +0200 Subject: [PATCH 50/74] Xfail beam stuff. 9 failures --- spacy/tests/parser/test_nn_beam.py | 2 ++ spacy/tests/parser/test_parse.py | 8 ++++++-- spacy/tests/regression/test_issue4001-4500.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 4ba020ef0..6e87c5fba 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -118,6 +118,7 @@ def test_beam_advance_too_few_scores(beam, scores): beam.advance(scores[:-1]) +@pytest.mark.xfail(reason="no beam parser yet") def test_beam_parse(examples, beam_width): nlp = Language() parser = nlp.add_pipe("beam_parser") @@ -128,6 +129,7 @@ def test_beam_parse(examples, beam_width): parser(doc) +@pytest.mark.xfail(reason="no beam parser yet") @hypothesis.given(hyp=hypothesis.strategies.data()) def test_beam_density(moves, examples, beam_width, hyp): beam_density = float(hyp.draw(hypothesis.strategies.floats(0.0, 1.0, width=32))) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index dc878dd7a..64c71f821 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -55,6 +55,8 @@ PARTIAL_DATA = [ ), ] +PARSERS = ["parser"] # TODO: Test beam_parser when ready + eps = 0.1 @@ -215,7 +217,7 @@ def test_parser_set_sent_starts(en_vocab): assert token.head in sent -@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) +@pytest.mark.parametrize("pipe_name", PARSERS) def test_incomplete_data(pipe_name): # Test that the parser works with incomplete information nlp = English() @@ -241,7 +243,7 @@ def test_incomplete_data(pipe_name): assert doc[2].head.i == 1 -@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) +@pytest.mark.parametrize("pipe_name", PARSERS) def test_overfitting_IO(pipe_name): # Simple test to try and quickly overfit the dependency parser (normal or beam) nlp = English() @@ -292,6 +294,7 @@ def test_overfitting_IO(pipe_name): assert_equal(batch_deps_1, no_batch_deps) +@pytest.mark.xfail(reason="no beam parser yet") def test_beam_parser_scores(): # Test that we can get confidence values out of the beam_parser pipe beam_width = 16 @@ -330,6 +333,7 @@ def test_beam_parser_scores(): assert 0 - eps <= head_score <= 1 + eps +@pytest.mark.xfail(reason="no beam parser yet") def test_beam_overfitting_IO(): # Simple test to try and quickly overfit the Beam dependency parser nlp = English() diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 25982623f..5f65faee4 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -287,6 +287,7 @@ def test_multiple_predictions(): dummy_pipe(doc) +@pytest.mark.xfail(reason="no beam parser yet") def test_issue4313(): """ This should not crash or exit with some strange error code """ beam_width = 16 From 6b5302cdf36bb0232df898375939cf91ee5c59c5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Oct 2021 23:24:33 +0200 Subject: [PATCH 51/74] More xfail. 
7 failures --- spacy/tests/parser/test_ner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index dffdff1ec..b22d2deee 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -359,6 +359,7 @@ def test_overfitting_IO(use_upper): assert_equal(batch_deps_1, no_batch_deps) +@pytest.mark.xfail(reason="no beam parser yet") def test_beam_ner_scores(): # Test that we can get confidence values out of the beam_ner pipe beam_width = 16 @@ -394,6 +395,7 @@ def test_beam_ner_scores(): assert 0 - eps <= score <= 1 + eps +@pytest.mark.xfail(reason="no beam parser yet") def test_beam_overfitting_IO(): # Simple test to try and quickly overfit the Beam NER component nlp = English() From 79d5957c47fbdef9857f403af21afe06145bff5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 Oct 2021 23:26:07 +0200 Subject: [PATCH 52/74] Xfail. 6 failures --- spacy/tests/test_misc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 125adbd37..587365bfe 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -50,6 +50,7 @@ def test_util_get_package_path(package): assert isinstance(path, Path) +@pytest.mark.xfail(reason="No precomputable affine") def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize() assert model.get_param("W").shape == (nF, nO, nP, nI) From 753f9ee68581be917d26e5a7cf7cea95be8e4e43 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 Oct 2021 13:25:15 +0200 Subject: [PATCH 53/74] cleanup --- spacy/cli/templates/quickstart_training.jinja | 12 ++--- spacy/ml/models/parser.py | 52 +------------------ spacy/pipeline/dep_parser.py | 16 ++++-- spacy/pipeline/ner.py | 29 ++++++++--- spacy/tests/parser/test_ner.py | 12 ++--- website/docs/api/architectures.md | 24 ++++----- website/docs/usage/embeddings-transformers.md | 6 +-- 7 files changed, 57 insertions(+), 94 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index ab1d69894..ff190804c 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -75,12 +75,11 @@ grad_factor = 1.0 factory = "parser" [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = false nO = null [components.parser.model.tok2vec] @@ -96,12 +95,11 @@ grad_factor = 1.0 factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = false nO = null [components.ner.model.tok2vec] @@ -257,12 +255,11 @@ width = ${components.tok2vec.model.encode.width} factory = "parser" [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = true nO = null [components.parser.model.tok2vec] @@ -275,12 +272,11 @@ width = ${components.tok2vec.model.encode.width} factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = 
false hidden_width = 64 maxout_pieces = 2 -use_upper = true nO = null [components.ner.model.tok2vec] diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index fd476382f..bbc5bf957 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -12,49 +12,8 @@ TransitionSystem = Any # TODO State = Any # TODO -@registry.architectures.register("spacy.TransitionBasedParser.v1") -def transition_parser_v1( - tok2vec: Model[List[Doc], List[Floats2d]], - state_type: Literal["parser", "ner"], - extra_state_tokens: bool, - hidden_width: int, - maxout_pieces: int, - use_upper: bool = True, - nO: Optional[int] = None, -) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]: - return build_tb_parser_model( - tok2vec, - state_type, - extra_state_tokens, - hidden_width, - maxout_pieces, - use_upper, - nO, - ) - - -@registry.architectures.register("spacy.TransitionBasedParser.v2") -def transition_parser_v2( - tok2vec: Model[List[Doc], List[Floats2d]], - state_type: Literal["parser", "ner"], - extra_state_tokens: bool, - hidden_width: int, - maxout_pieces: int, - use_upper: bool, - nO: Optional[int] = None, -) -> Model: - return build_tb_parser_model( - tok2vec, - state_type, - extra_state_tokens, - hidden_width, - maxout_pieces, - nO=nO, - ) - - @registry.architectures.register("spacy.TransitionBasedParser.v3") -def transition_parser_v2( +def transition_parser_v3( tok2vec: Model[List[Doc], List[Floats2d]], state_type: Literal["parser", "ner"], extra_state_tokens: bool, @@ -111,14 +70,7 @@ def build_tb_parser_model( feature sets (for the NER) or 13 (for the parser). hidden_width (int): The width of the hidden layer. maxout_pieces (int): How many pieces to use in the state prediction layer. - Recommended values are 1, 2 or 3. If 1, the maxout non-linearity - is replaced with a ReLu non-linearity if use_upper=True, and no - non-linearity if use_upper=False. - use_upper (bool): Whether to use an additional hidden layer after the state - vector in order to predict the action scores. It is recommended to set - this to False for large pretrained models such as transformers, and False - for smaller networks. The upper layer is computed on CPU, which becomes - a bottleneck on larger GPU-based models, where it's also less necessary. + Recommended values are 1, 2 or 3. nO (int or None): The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py index 7bdb2849d..02ae63925 100644 --- a/spacy/pipeline/dep_parser.py +++ b/spacy/pipeline/dep_parser.py @@ -16,12 +16,11 @@ from ..training import validate_examples default_model_config = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -62,7 +61,7 @@ def make_parser( moves: Optional[list], update_with_oracle_cut_size: int, learn_tokens: bool, - min_action_freq: int + min_action_freq: int, ): """Create a transition-based DependencyParser component. 
The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can @@ -114,6 +113,7 @@ def make_parser( beam_update_prob=0.0, ) + @Language.factory( "beam_parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], @@ -195,7 +195,7 @@ def make_beam_parser( beam_update_prob=beam_update_prob, multitasks=[], learn_tokens=learn_tokens, - min_action_freq=min_action_freq + min_action_freq=min_action_freq, ) @@ -204,6 +204,7 @@ class DependencyParser(Parser): DOCS: https://nightly.spacy.io/api/dependencyparser """ + TransitionSystem = ArcEager @property @@ -245,16 +246,21 @@ class DependencyParser(Parser): DOCS: https://nightly.spacy.io/api/dependencyparser#score """ + def has_sents(doc): return doc.has_annotation("SENT_START") validate_examples(examples, "DependencyParser.score") + def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep + results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + results.update( + Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + ) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py index cd2f9e1cf..474dec9bd 100644 --- a/spacy/pipeline/ner.py +++ b/spacy/pipeline/ner.py @@ -13,12 +13,11 @@ from ..training import validate_examples default_model_config = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -41,8 +40,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, - + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_ner( nlp: Language, @@ -89,6 +92,7 @@ def make_ner( beam_update_prob=0.0, ) + @Language.factory( "beam_ner", assigns=["doc.ents", "token.ent_iob", "token.ent_type"], @@ -98,9 +102,14 @@ def make_ner( "model": DEFAULT_NER_MODEL, "beam_density": 0.01, "beam_update_prob": 0.5, - "beam_width": 32 + "beam_width": 32, + }, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) def make_beam_ner( nlp: Language, @@ -165,6 +174,7 @@ class EntityRecognizer(Parser): DOCS: https://nightly.spacy.io/api/entityrecognizer """ + TransitionSystem = BiluoPushDown def add_multitask_objective(self, mt_component): @@ -184,8 +194,11 @@ class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. 
# B-PERSON, I-PERSON, L-PERSON, U-PERSON - labels = set(move.split("-")[1] for move in self.move_names - if move[0] in ("B", "I", "L", "U")) + labels = set( + move.split("-")[1] + for move in self.move_names + if move[0] in ("B", "I", "L", "U") + ) return tuple(sorted(labels)) def score(self, examples, **kwargs): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b22d2deee..0ff5c5a66 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -246,7 +246,7 @@ def test_empty_ner(): def test_ruler_before_ner(): - """ Test that an NER works after an entity_ruler: the second can add annotations """ + """Test that an NER works after an entity_ruler: the second can add annotations""" nlp = English() # 1 : Entity Ruler - should set "this" to B and everything else to empty @@ -266,7 +266,7 @@ def test_ruler_before_ner(): def test_ner_before_ruler(): - """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """ + """Test that an entity_ruler works after an NER: the second can overwrite O annotations""" nlp = English() # 1: untrained NER - should set everything to O @@ -287,7 +287,7 @@ def test_ner_before_ruler(): def test_block_ner(): - """ Test functionality for blocking tokens so they can't be in a named entity """ + """Test functionality for blocking tokens so they can't be in a named entity""" # block "Antti L Korhonen" from being a named entity nlp = English() nlp.add_pipe("blocker", config={"start": 2, "end": 5}) @@ -301,11 +301,10 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types -@pytest.mark.parametrize("use_upper", [True, False]) -def test_overfitting_IO(use_upper): +def test_overfitting_IO(): # Simple test to try and quickly overfit the NER component nlp = English() - ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}}) + ner = nlp.add_pipe("ner") train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -337,7 +336,6 @@ def test_overfitting_IO(use_upper): assert ents2[0].label_ == "LOC" # Ensure that the predictions are still the same, even after adding a new label ner2 = nlp2.get_pipe("ner") - assert ner2.model.attrs["has_upper"] == use_upper ner2.add_label("RANDOM_NEW_LABEL") doc3 = nlp2(test_text) ents3 = doc3.ents diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index d8f0ce022..b1f274252 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -494,18 +494,17 @@ for a Tok2Vec layer. ## Parser & NER architectures {#parser} -### spacy.TransitionBasedParser.v2 {#TransitionBasedParser source="spacy/ml/models/parser.py"} +### spacy.TransitionBasedParser.v3 {#TransitionBasedParser source="spacy/ml/models/parser.py"} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TransitionBasedParser.v2" +> @architectures = "spacy.TransitionBasedParser.v3" > state_type = "ner" > extra_state_tokens = false > hidden_width = 64 > maxout_pieces = 2 -> use_upper = true > > [model.tok2vec] > @architectures = "spacy.HashEmbedCNN.v1" @@ -535,16 +534,15 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. 
-| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | -| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | -| `hidden_width` | The width of the hidden layer. ~~int~~ | -| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | -| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | -| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | +| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index fdf15d187..b39bc3eb3 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -141,7 +141,7 @@ factory = "tok2vec" factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -158,7 +158,7 @@ same. 
This makes them fully independent and doesn't require an upstream factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2Vec.v2" @@ -446,7 +446,7 @@ sneakily delegates to the `Transformer` pipeline component. factory = "ner" [nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 128 From dbaf68a43964bb815389f652fe32d801372ee349 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 Oct 2021 14:19:30 +0200 Subject: [PATCH 54/74] formatting --- spacy/pipeline/dep_parser.py | 7 +++---- spacy/pipeline/ner.py | 13 ++++++------- spacy/tests/parser/test_parse.py | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py index f9d9d4840..0be6e6ccd 100644 --- a/spacy/pipeline/dep_parser.py +++ b/spacy/pipeline/dep_parser.py @@ -114,7 +114,7 @@ def make_parser( beam_update_prob=0.0, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, ) @@ -207,7 +207,7 @@ def make_beam_parser( min_action_freq=min_action_freq, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, ) @@ -235,8 +235,7 @@ class DependencyParser(Parser): multitasks=tuple(), incorrect_spans_key=None, ): - """Create a DependencyParser. - """ + """Create a DependencyParser.""" super().__init__( vocab, model, diff --git a/spacy/pipeline/ner.py b/spacy/pipeline/ner.py index 830f1aacd..b18889203 100644 --- a/spacy/pipeline/ner.py +++ b/spacy/pipeline/ner.py @@ -40,7 +40,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None + "incorrect_spans_key": None, }, default_score_weights={ "ents_f": 1.0, @@ -55,7 +55,7 @@ def make_ner( model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str] = None, ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -126,7 +126,7 @@ def make_beam_ner( beam_width: int, beam_density: float, beam_update_prob: float, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str] = None, ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -173,7 +173,7 @@ def make_beam_ner( beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key + incorrect_spans_key=incorrect_spans_key, ) @@ -199,15 +199,14 @@ class EntityRecognizer(Parser): multitasks=tuple(), incorrect_spans_key=None, ): - """Create an EntityRecognizer. 
- """ + """Create an EntityRecognizer.""" super().__init__( vocab, model, name, moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - min_action_freq=1, # not relevant for NER + min_action_freq=1, # not relevant for NER learn_tokens=False, # not relevant for NER beam_width=beam_width, beam_density=beam_density, diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 574963f1f..52e81de94 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -56,7 +56,7 @@ PARTIAL_DATA = [ ), ] -PARSERS = ["parser"] # TODO: Test beam_parser when ready +PARSERS = ["parser"] # TODO: Test beam_parser when ready eps = 0.1 From 1cc0d05812c5c4874d6c4ad12b61ef92ed8ea57c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 Oct 2021 17:10:07 +0200 Subject: [PATCH 55/74] fixes --- setup.py | 6 +++++- spacy/pipeline/transition_parser.pyx | 2 +- spacy/tests/parser/test_add_label.py | 1 + spacy/tests/pipeline/test_tok2vec.py | 2 +- spacy/tests/test_misc.py | 2 +- spacy/tokens/_dict_proxies.py | 3 --- spacy/training/example.pyx | 1 - 7 files changed, 9 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index dcfa98cfa..1397a8d01 100755 --- a/setup.py +++ b/setup.py @@ -201,7 +201,11 @@ def setup_package(): for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" ext = Extension( - name, [mod_path], language="c++", extra_compile_args=["-std=c++11"] + name, + [mod_path], + language="c++", + include_dirs=include_dirs, + extra_compile_args=["-std=c++11"], ) ext_modules.append(ext) print("Cythonizing sources") diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 945652cad..814a4d894 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -47,7 +47,7 @@ class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), - incorrect_spans_key=None + incorrect_spans_key=None, ): """Create a Parser. 
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index f89e993e9..540b00f89 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -123,6 +123,7 @@ def test_ner_labels_added_implicitly_on_predict(): assert "D" in ner.labels +@pytest.mark.skip(reason="Not yet supported") def test_ner_labels_added_implicitly_on_beam_parse(): nlp = Language() ner = nlp.add_pipe("beam_ner") diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index eeea906bb..50c4b90ce 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -255,7 +255,7 @@ cfg_string_multi = """ factory = "ner" [components.ner.model] - @architectures = "spacy.TransitionBasedParser.v2" + @architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4dd56a4a5..4ce63ede0 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -5,7 +5,7 @@ from pathlib import Path from spacy.about import __version__ as spacy_version from spacy import util from spacy import prefer_gpu, require_gpu, require_cpu -from spacy.util import dot_to_object, SimpleFrozenList +from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int from thinc.api import Config, Optimizer, ConfigValidationError from thinc.api import set_current_ops from spacy.training.batchers import minibatch_by_words diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 83399eafa..470d3430f 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -40,9 +40,6 @@ class SpanGroups(UserDict): doc = self._ensure_doc() return SpanGroups(doc).from_bytes(self.to_bytes()) - def copy(self) -> "SpanGroups": - return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes()) - def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. 
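# A minimal, hypothetical usage sketch (not part of the patch): the surviving
# SpanGroups.copy shown in the hunk above round-trips the groups through
# to_bytes()/from_bytes(), which is what lets Doc.copy() carry doc.spans across.
# The pipeline and span names below are only illustrative.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I like London and Berlin")
doc.spans["cities"] = [Span(doc, 2, 3, label="CITY"), Span(doc, 4, 5, label="CITY")]
doc2 = doc.copy()
assert [span.text for span in doc2.spans["cities"]] == ["London", "Berlin"]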
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 732203e7b..5357b5c0b 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -1,5 +1,4 @@ from collections.abc import Iterable as IterableInstance -import warnings import numpy from murmurhash.mrmr cimport hash64 From 87cf72d1c8af4d8316c5f4315fb99d9a00e9ec31 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 Oct 2021 17:38:11 +0200 Subject: [PATCH 56/74] pass nO through --- spacy/ml/tb_framework.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 10d263851..cd543131a 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,5 @@ from typing import List, Tuple, Any, Optional -from thinc.api import Ops, Model, normal_init, chain, list2array, Linear +from thinc.api import Model, normal_init, chain, list2array, Linear from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d import numpy from ..tokens.doc import Doc @@ -36,12 +36,12 @@ def TransitionModel( params={ "lower_W": None, # Floats2d W for the hidden layer "lower_b": None, # Floats1d bias for the hidden layer - "lower_pad": None, # Floats1d bias for the hidden layer + "lower_pad": None, # Floats1d padding for the hidden layer "upper_W": None, # Floats2d W for the output layer "upper_b": None, # Floats1d bias for the output layer }, dims={ - "nO": None, # Output size + "nO": nO, "nP": maxout_pieces, "nH": hidden_width, "nI": tok2vec_projected.maybe_get_dim("nO"), From dd03ad2e27751589965e31da279bd23a62831c7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 01:27:36 +0200 Subject: [PATCH 57/74] Fix empty doc in update --- spacy/pipeline/transition_parser.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 814a4d894..04874357f 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -291,7 +291,7 @@ class Parser(TrainablePipe): if n_examples == 0: return losses set_dropout_rate(self.model, drop) - docs = [eg.x for eg in examples] + docs = [eg.x for eg in examples if len(eg.x)] (states, scores), backprop_scores = self.model.begin_update((docs, self.moves)) if sum(s.shape[0] for s in scores) == 0: return losses @@ -343,6 +343,8 @@ class Parser(TrainablePipe): states = moves.init_batch([eg.x for eg in examples]) cdef int i = 0 for eg, state, history in zip(examples, states, histories): + if len(history) == 0: + continue gold = moves.init_gold(state, eg) for clas in history: moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold) From dea702b4b7a6786dc373e16a9a50ccd9070a4c5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 01:28:20 +0200 Subject: [PATCH 58/74] Hackishly fix resizing. 
3 failures --- spacy/ml/tb_framework.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index cd543131a..9f852c628 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -73,7 +73,12 @@ def resize_output(model: Model, new_nO: int) -> Model: model.attrs["unseen_classes"].add(i) model.set_param("upper_W", new_W) model.set_param("upper_b", new_b) - model.set_dim("nO", new_nO, force=True) + # TODO: Avoid this private intrusion + model._dims["nO"] = new_nO + if model.has_grad("upper_W"): + model.set_grad("upper_W", model.get_param("upper_W") * 0) + if model.has_grad("upper_b"): + model.set_grad("upper_b", model.get_param("upper_b") * 0) return model From 604ceb1da1b87add41a8d1adcc702c71062681a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 01:56:28 +0200 Subject: [PATCH 59/74] Fix redundant test. 2 failures --- spacy/tests/parser/test_ner.py | 39 ++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 587d1fff1..efc7ebc1b 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -131,6 +131,41 @@ def test_negative_sample_key_is_in_config(vocab, entity_types): assert tsys.cfg["neg_key"] == "non_entities" +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): + entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots] + example = Example.from_dict(doc, {"entities": entity_annots}) + ex_dict = example.to_dict() + + for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): + if tag == "L-!GPE": + ex_dict["doc_annotation"]["entities"][i] = "-" + example = Example.from_dict(doc, ex_dict) + + act_classes = tsys.get_oracle_sequence(example) + names = [tsys.get_class_name(act) for act in act_classes] + assert names + + +def test_get_oracle_moves_negative_entities2(tsys, vocab): + doc = Doc(vocab, words=["A", "B", "C", "D"]) + entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] + example = Example.from_dict(doc, {"entities": entity_annots}) + act_classes = tsys.get_oracle_sequence(example) + names = [tsys.get_class_name(act) for act in act_classes] + assert names + + +@pytest.mark.skip(reason="Maybe outdated? Unsure") +def test_get_oracle_moves_negative_O(tsys, vocab): + doc = Doc(vocab, words=["A", "B", "C", "D"]) + entity_annots = ["O", "!O", "O", "!O"] + example = Example.from_dict(doc, {"entities": entity_annots}) + act_classes = tsys.get_oracle_sequence(example) + names = [tsys.get_class_name(act) for act in act_classes] + assert names + + # We can't easily represent this on a Doc object. Not sure what the best solution # would be, but I don't think it's an important use case? 
@pytest.mark.skip(reason="No longer supported") @@ -242,7 +277,7 @@ def test_train_empty(): train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) ner = nlp.add_pipe("ner", last=True) ner.add_label("PERSON") - nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) for itn in range(2): losses = {} batches = util.minibatch(train_examples, size=8) @@ -372,7 +407,7 @@ def test_block_ner(): def test_overfitting_IO(): # Simple test to try and quickly overfit the NER component nlp = English() - ner = nlp.add_pipe("ner") + ner = nlp.add_pipe("ner", config={"model": {}}) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) From 5903138ab64a2d604d102a46c4dfe00e9f29e877 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 13:30:28 +0100 Subject: [PATCH 60/74] Add reference version --- spacy/ml/tb_framework.py | 138 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 3 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 9f852c628..589505cd5 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,6 @@ from typing import List, Tuple, Any, Optional -from thinc.api import Model, normal_init, chain, list2array, Linear +from thinc.api import Ops, Model, normal_init, chain, list2array, Linear +from thinc.api import uniform_init from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d import numpy from ..tokens.doc import Doc @@ -29,7 +30,7 @@ def TransitionModel( return Model( name="parser_model", - forward=forward, + forward=_forward_reference, init=init, layers=[tok2vec_projected], refs={"tok2vec": tok2vec_projected}, @@ -41,7 +42,7 @@ def TransitionModel( "upper_b": None, # Floats1d bias for the output layer }, dims={ - "nO": nO, + "nO": None, # Output size "nP": maxout_pieces, "nH": hidden_width, "nI": tok2vec_projected.maybe_get_dim("nO"), @@ -186,6 +187,137 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo d_statevecs = model.ops.gemm(d_scores, upper_W) # Backprop through the maxout activation d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) + d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], -1) + model.inc_grad("lower_b", d_preacts2f.sum(axis=0)) + model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True)) + d_tokfeats = model.ops.gemm(d_preacts2f, lower_W) + d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI) + d_lower_pad = model.ops.alloc2f(nF, nI) + for i in range(ids.shape[0]): + for j in range(ids.shape[1]): + if ids[i, j] == -1: + d_lower_pad[j] += d_tokfeats3f[i, j] + else: + d_tokvecs[ids[i, j]] += d_tokfeats3f[i, j] + model.inc_grad("lower_pad", d_lower_pad) + # We don't need to backprop the summation, because we pass back the IDs instead + # d_state_features = backprop_feats((d_preacts, all_ids)) + # ids1d = model.ops.xp.vstack(all_ids).flatten() + # d_state_features = d_state_features.reshape((ids1d.size, -1)) + # d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) + # model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) + return (backprop_tok2vec(d_tokvecs), None) + + return (states, all_scores), backprop_parser + + + +def _forward_reference(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool): + """Slow reference implementation, without the precomputation""" + nF = model.get_dim("nF") + tok2vec = model.get_ref("tok2vec") + 
lower_pad = model.get_param("lower_pad") + lower_W = model.get_param("lower_W") + lower_b = model.get_param("lower_b") + upper_W = model.get_param("upper_W") + upper_b = model.get_param("upper_b") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + nO = model.get_dim("nO") + nI = model.get_dim("nI") + + ops = model.ops + docs, moves = docs_moves + states = moves.init_batch(docs) + tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + all_ids = [] + all_which = [] + all_statevecs = [] + all_scores = [] + all_tokfeats = [] + next_states = [s for s in states if not s.is_final()] + unseen_mask = _get_unseen_mask(model) + assert unseen_mask.all() # TODO unhack + ids = numpy.zeros((len(states), nF), dtype="i") + while next_states: + ids = ids[: len(next_states)] + for i, state in enumerate(next_states): + state.set_context_tokens(ids, i, nF) + # Sum the state features, add the bias and apply the activation (maxout) + # to create the state vectors. + tokfeats3f = model.ops.alloc3f(ids.shape[0], nF, nI) + for i in range(ids.shape[0]): + for j in range(nF): + if ids[i, j] == -1: + tokfeats3f[i, j] = lower_pad + else: + tokfeats3f[i, j] = tokvecs[ids[i, j]] + tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1) + preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True) + preacts2f += lower_b + preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP) + statevecs, which = ops.maxout(preacts) + # Multiply the state-vector by the scores weights and add the bias, + # to get the logits. + scores = model.ops.gemm(statevecs, upper_W, trans2=True) + scores += upper_b + scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) + # Transition the states, filtering out any that are finished. + next_states = moves.transition_states(next_states, scores) + all_scores.append(scores) + if is_train: + # Remember intermediate results for the backprop. + all_tokfeats.append(tokfeats) + all_ids.append(ids.copy()) + all_statevecs.append(statevecs) + all_which.append(which) + + nS = sum(len(s.history) for s in states) + + def backprop_parser(d_states_d_scores): + d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1]) + ids = model.ops.xp.vstack(all_ids) + which = ops.xp.vstack(all_which) + statevecs = model.ops.xp.vstack(all_statevecs) + tokfeats = model.ops.xp.vstack(all_tokfeats) + _, d_scores = d_states_d_scores + if model.attrs.get("unseen_classes"): + # If we have a negative gradient (i.e. the probability should + # increase) on any classes we filtered out as unseen, mark + # them as seen. + for clas in set(model.attrs["unseen_classes"]): + if (d_scores[:, clas] < 0).any(): + model.attrs["unseen_classes"].remove(clas) + d_scores *= unseen_mask + assert statevecs.shape == (nS, nH), statevecs.shape + assert d_scores.shape == (nS, nO), d_scores.shape + # Calculate the gradients for the parameters of the upper layer. + # The weight gemm is (nS, nO) @ (nS, nH).T + model.inc_grad("upper_b", d_scores.sum(axis=0)) + model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) + # Now calculate d_statevecs, by backproping through the upper linear layer. + # This gemm is (nS, nO) @ (nO, nH) + d_statevecs = model.ops.gemm(d_scores, upper_W) + # Backprop through the maxout activation + d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP) + d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH*nP) + # Now increment the gradients for the lower layer. 
+ # The gemm here is (nS, nH*nP) @ (nS, nF*nI) + model.inc_grad("lower_b", d_preacts2f.sum(axis=0)) + model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True)) + # Caclulate d_tokfeats + # The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI) + d_tokfeats = model.ops.gemm(d_preacts2f, lower_W) + # Get the gradients of the tokvecs and the padding + d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI) + d_lower_pad = model.ops.alloc1f(nI) + for i in range(ids.shape[0]): + for j in range(ids.shape[1]): + if ids[i, j] == -1: + d_lower_pad += d_tokfeats3f[i, j] + else: + d_tokvecs[ids[i, j]] += d_tokfeats3f[i, j] + model.inc_grad("lower_pad", d_lower_pad) # We don't need to backprop the summation, because we pass back the IDs instead d_state_features = backprop_feats((d_preacts, all_ids)) ids1d = model.ops.xp.vstack(all_ids).flatten() From f8672c4dc2326dd6339a07970c1f00313c89bb17 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 16:09:54 +0100 Subject: [PATCH 61/74] black formatting --- spacy/ml/tb_framework.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 589505cd5..2321b34a3 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -211,8 +211,9 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo return (states, all_scores), backprop_parser - -def _forward_reference(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool): +def _forward_reference( + model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool +): """Slow reference implementation, without the precomputation""" nF = model.get_dim("nF") tok2vec = model.get_ref("tok2vec") @@ -300,7 +301,7 @@ def _forward_reference(model, docs_moves: Tuple[List[Doc], TransitionSystem], is d_statevecs = model.ops.gemm(d_scores, upper_W) # Backprop through the maxout activation d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP) - d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH*nP) + d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP) # Now increment the gradients for the lower layer. 
# The gemm here is (nS, nH*nP) @ (nS, nF*nI) model.inc_grad("lower_b", d_preacts2f.sum(axis=0)) From 337b3f22b8d77097f460245e1716f2b453c210f5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 17:04:16 +0100 Subject: [PATCH 62/74] Get tests passing with reference implementation --- spacy/ml/tb_framework.py | 289 ++++++++++++++++----------- spacy/pipeline/transition_parser.pyx | 32 +-- spacy/tests/parser/test_ner.py | 4 +- spacy/tests/parser/test_parse.py | 3 + 4 files changed, 199 insertions(+), 129 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 2321b34a3..4d0d3283b 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,6 +1,6 @@ from typing import List, Tuple, Any, Optional from thinc.api import Ops, Model, normal_init, chain, list2array, Linear -from thinc.api import uniform_init +from thinc.api import uniform_init, glorot_uniform_init, zero_init from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d import numpy from ..tokens.doc import Doc @@ -107,114 +107,26 @@ def init( nF = model.get_dim("nF") ops = model.ops - Wl = ops.alloc4f(nF, nH, nP, nI) - bl = ops.alloc2f(nH, nP) - padl = ops.alloc4f(1, nF, nH, nP) + Wl = ops.alloc2f(nH * nP, nF * nI) + bl = ops.alloc1f(nH * nP) + padl = ops.alloc1f(nI) Wu = ops.alloc2f(nO, nH) bu = ops.alloc1f(nO) - Wl = normal_init(ops, Wl.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) # type: ignore - padl = normal_init(ops, padl.shape, mean=1.0) # type: ignore + Wu = zero_init(ops, Wu.shape) + #Wl = zero_init(ops, Wl.shape) + Wl = glorot_uniform_init(ops, Wl.shape) + padl = uniform_init(ops, padl.shape) # type: ignore # TODO: Experiment with whether better to initialize upper_W model.set_param("lower_W", Wl) model.set_param("lower_b", bl) model.set_param("lower_pad", padl) model.set_param("upper_W", Wu) model.set_param("upper_b", bu) - - _lsuv_init(model) + # model = _lsuv_init(model) + return model def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool): - nF = model.get_dim("nF") - tok2vec = model.get_ref("tok2vec") - lower_pad = model.get_param("lower_pad") - lower_b = model.get_param("lower_b") - upper_W = model.get_param("upper_W") - upper_b = model.get_param("upper_b") - - ops = model.ops - docs, moves = docs_moves - states = moves.init_batch(docs) - tokvecs, backprop_tok2vec = tok2vec(docs, is_train) - feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) - all_ids = [] - all_which = [] - all_statevecs = [] - all_scores = [] - next_states = [s for s in states if not s.is_final()] - unseen_mask = _get_unseen_mask(model) - ids = numpy.zeros((len(states), nF), dtype="i") - arange = model.ops.xp.arange(nF) - while next_states: - ids = ids[: len(next_states)] - for i, state in enumerate(next_states): - state.set_context_tokens(ids, i, nF) - # Sum the state features, add the bias and apply the activation (maxout) - # to create the state vectors. - preacts = feats[ids, arange].sum(axis=1) # type: ignore - preacts += lower_b - statevecs, which = ops.maxout(preacts) - # Multiply the state-vector by the scores weights and add the bias, - # to get the logits. - scores = ops.gemm(statevecs, upper_W, trans2=True) - scores += upper_b - scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) - # Transition the states, filtering out any that are finished. - next_states = moves.transition_states(next_states, scores) - all_scores.append(scores) - if is_train: - # Remember intermediate results for the backprop. 
- all_ids.append(ids.copy()) - all_statevecs.append(statevecs) - all_which.append(which) - - def backprop_parser(d_states_d_scores): - _, d_scores = d_states_d_scores - if model.attrs.get("unseen_classes"): - # If we have a negative gradient (i.e. the probability should - # increase) on any classes we filtered out as unseen, mark - # them as seen. - for clas in set(model.attrs["unseen_classes"]): - if (d_scores[:, clas] < 0).any(): - model.attrs["unseen_classes"].remove(clas) - d_scores *= unseen_mask - statevecs = ops.xp.vstack(all_statevecs) - which = ops.xp.vstack(all_which) - # Calculate the gradients for the parameters of the upper layer. - model.inc_grad("upper_b", d_scores.sum(axis=0)) - model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) - # Now calculate d_statevecs, by backproping through the upper linear layer. - d_statevecs = model.ops.gemm(d_scores, upper_W) - # Backprop through the maxout activation - d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) - d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], -1) - model.inc_grad("lower_b", d_preacts2f.sum(axis=0)) - model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True)) - d_tokfeats = model.ops.gemm(d_preacts2f, lower_W) - d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI) - d_lower_pad = model.ops.alloc2f(nF, nI) - for i in range(ids.shape[0]): - for j in range(ids.shape[1]): - if ids[i, j] == -1: - d_lower_pad[j] += d_tokfeats3f[i, j] - else: - d_tokvecs[ids[i, j]] += d_tokfeats3f[i, j] - model.inc_grad("lower_pad", d_lower_pad) - # We don't need to backprop the summation, because we pass back the IDs instead - # d_state_features = backprop_feats((d_preacts, all_ids)) - # ids1d = model.ops.xp.vstack(all_ids).flatten() - # d_state_features = d_state_features.reshape((ids1d.size, -1)) - # d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) - # model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) - return (backprop_tok2vec(d_tokvecs), None) - - return (states, all_scores), backprop_parser - - -def _forward_reference( - model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool -): - """Slow reference implementation, without the precomputation""" nF = model.get_dim("nF") tok2vec = model.get_ref("tok2vec") lower_pad = model.get_param("lower_pad") @@ -231,6 +143,103 @@ def _forward_reference( docs, moves = docs_moves states = moves.init_batch(docs) tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) + all_ids = [] + all_which = [] + all_statevecs = [] + all_scores = [] + all_tokfeats = [] + next_states = [s for s in states if not s.is_final()] + unseen_mask = _get_unseen_mask(model) + ids = numpy.zeros((len(states), nF), dtype="i") + arange = model.ops.xp.arange(nF) + while next_states: + ids = ids[: len(next_states)] + for i, state in enumerate(next_states): + state.set_context_tokens(ids, i, nF) + preacts = feats[ids, arange].sum(axis=1) # type: ignore + statevecs, which = ops.maxout(preacts) + # Multiply the state-vector by the scores weights and add the bias, + # to get the logits. + scores = ops.gemm(statevecs, upper_W, trans2=True) + scores += upper_b + scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) + # Transition the states, filtering out any that are finished. 
+ next_states = moves.transition_states(next_states, scores) + all_scores.append(scores) + if is_train: + # Remember intermediate results for the backprop. + all_tokfeats.append(tokfeats) + all_ids.append(ids.copy()) + all_statevecs.append(statevecs) + all_which.append(which) + + nS = sum(len(s.history) for s in states) + + def backprop_parser(d_states_d_scores): + d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1]) + ids = model.ops.xp.vstack(all_ids) + which = ops.xp.vstack(all_which) + _, d_scores = d_states_d_scores + if model.attrs.get("unseen_classes"): + # If we have a negative gradient (i.e. the probability should + # increase) on any classes we filtered out as unseen, mark + # them as seen. + for clas in set(model.attrs["unseen_classes"]): + if (d_scores[:, clas] < 0).any(): + model.attrs["unseen_classes"].remove(clas) + d_scores *= unseen_mask + statevecs = ops.xp.vstack(all_statevecs) + tokfeats = ops.xp.vstack(all_tokfeats) + assert statevecs.shape == (nS, nH), statevecs.shape + assert d_scores.shape == (nS, nO), d_scores.shape + # Calculate the gradients for the parameters of the upper layer. + model.inc_grad("upper_b", d_scores.sum(axis=0)) + model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) + # Now calculate d_statevecs, by backproping through the upper linear layer. + d_statevecs = model.ops.gemm(d_scores, upper_W) + # Backprop through the maxout activation + d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) + model.inc_grad("lower_b", d_preacts.sum(axis=0)) + model.inc_grad("lower_W", model.ops.gemm(d_preacts, tokfeats, trans1=True)) + # We don't need to backprop the summation, because we pass back the IDs instead + d_state_features = backprop_feats((d_preacts, all_ids)) + ids1d = model.ops.xp.vstack(all_ids).flatten() + d_state_features = d_state_features.reshape((ids1d.size, -1)) + d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) + model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) + return (backprop_tok2vec(d_tokvecs), None) + + return (states, all_scores), backprop_parser + + +def _forward_reference( + model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool +): + """Slow reference implementation, without the precomputation""" + def debug_predict(*msg): + if not is_train: + pass + #print(*msg) + nF = model.get_dim("nF") + tok2vec = model.get_ref("tok2vec") + lower_pad = model.get_param("lower_pad") + lower_W = model.get_param("lower_W") + lower_b = model.get_param("lower_b") + upper_W = model.get_param("upper_W") + upper_b = model.get_param("upper_b") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + nO = model.get_dim("nO") + nI = model.get_dim("nI") + + ops = model.ops + docs, moves = docs_moves + states = moves.init_batch(docs) + tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + debug_predict("Tokvecs shape", tokvecs.shape) + debug_predict("Tokvecs mean", tokvecs.mean(axis=1)) + debug_predict("Tokvecs var", tokvecs.var(axis=1)) all_ids = [] all_which = [] all_statevecs = [] @@ -238,12 +247,12 @@ def _forward_reference( all_tokfeats = [] next_states = [s for s in states if not s.is_final()] unseen_mask = _get_unseen_mask(model) - assert unseen_mask.all() # TODO unhack ids = numpy.zeros((len(states), nF), dtype="i") while next_states: ids = ids[: len(next_states)] for i, state in enumerate(next_states): state.set_context_tokens(ids, i, nF) + debug_predict(ids) # Sum the state features, add the bias and apply the activation (maxout) # to create the 
state vectors. tokfeats3f = model.ops.alloc3f(ids.shape[0], nF, nI) @@ -251,8 +260,10 @@ def _forward_reference( for j in range(nF): if ids[i, j] == -1: tokfeats3f[i, j] = lower_pad + debug_predict("Setting tokfeat", i, j, "to pad") else: tokfeats3f[i, j] = tokvecs[ids[i, j]] + debug_predict("Setting tokfeat", i, j, "to", ids[i, j]) tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1) preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True) preacts2f += lower_b @@ -312,6 +323,7 @@ def _forward_reference( # Get the gradients of the tokvecs and the padding d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI) d_lower_pad = model.ops.alloc1f(nI) + assert ids.shape[0] == nS for i in range(ids.shape[0]): for j in range(ids.shape[1]): if ids[i, j] == -1: @@ -319,17 +331,12 @@ def _forward_reference( else: d_tokvecs[ids[i, j]] += d_tokfeats3f[i, j] model.inc_grad("lower_pad", d_lower_pad) - # We don't need to backprop the summation, because we pass back the IDs instead - d_state_features = backprop_feats((d_preacts, all_ids)) - ids1d = model.ops.xp.vstack(all_ids).flatten() - d_state_features = d_state_features.reshape((ids1d.size, -1)) - d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) - model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) - return (backprop_tok2vec(d_tokvecs[:-1]), None) + return (backprop_tok2vec(d_tokvecs), None) return (states, all_scores), backprop_parser + def _get_unseen_mask(model: Model) -> Floats1d: mask = model.ops.alloc1f(model.get_dim("nO")) mask.fill(1) @@ -370,10 +377,10 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): assert dY.shape[1] == nH, dY.shape assert dY.shape[2] == nP, dY.shape # nB = dY.shape[0] - model.inc_grad( - "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) - ) - model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore + # model.inc_grad( + # "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) + # ) + # model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore dY = model.ops.reshape2f(dY, dY.shape[0], nH * nP) Wopfi = W.transpose((1, 2, 0, 3)) Wopfi = Wopfi.reshape((nH * nP, nF * nI)) @@ -384,7 +391,7 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): dWopfi = dWopfi.reshape((nH, nP, nF, nI)) # (o, p, f, i) --> (f, o, p, i) dWopfi = dWopfi.transpose((2, 0, 1, 3)) - model.inc_grad("W", dWopfi) + model.inc_grad("lower_W", dWopfi) return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI) return Yf, backward @@ -425,7 +432,7 @@ def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]: return scores[0].shape[1] -def _lsuv_init(model): +def _lsuv_init(model: Model): """This is like the 'layer sequential unit variance', but instead of taking the actual inputs, we randomly generate whitened data. @@ -434,5 +441,59 @@ def _lsuv_init(model): we set the maxout weights to values that empirically result in whitened outputs given whitened inputs. 
""" - # TODO - return None + W = model.maybe_get_param("lower_W") + if W is not None and W.any(): + return + + nF = model.get_dim("nF") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.ops.alloc4f(nF, nH, nP, nI) + b = model.ops.alloc2f(nH, nP) + pad = model.ops.alloc4f(1, nF, nH, nP) + + ops = model.ops + W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) + pad = normal_init(ops, pad.shape, mean=1.0) + model.set_param("W", W) + model.set_param("b", b) + model.set_param("pad", pad) + + ids = ops.alloc((5000, nF), dtype="f") + ids += ops.xp.random.uniform(0, 1000, ids.shape) + ids = ops.asarray(ids, dtype="i") + tokvecs = ops.alloc((5000, nI), dtype="f") + tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( + tokvecs.shape + ) + + def predict(ids, tokvecs): + # nS ids. nW tokvecs. Exclude the padding array. + hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False) + vectors = model.ops.alloc2f(ids.shape[0], nH * nP) + # need nS vectors + hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP)) + model.ops.scatter_add(vectors, ids.flatten(), hiddens) + vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP) + vectors3f += b + return model.ops.maxout(vectors3f)[0] + + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + W = model.get_param("lower_W").copy() + b = model.get_param("lower_b").copy() + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = model.ops.xp.var(acts1) + mean = model.ops.xp.mean(acts1) + if abs(var - 1.0) >= tol_var: + W /= model.ops.xp.sqrt(var) + model.set_param("lower_W", W) + elif abs(mean) >= tol_mean: + b -= mean + model.set_param("lower_b", b) + else: + break + return model diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 04874357f..108d20da8 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -316,7 +316,7 @@ class Parser(TrainablePipe): xp = get_array_module(scores) best_costs = costs.min(axis=1, keepdims=True) gscores = scores.copy() - min_score = scores.min() + min_score = scores.min() - 1000 assert costs.shape == scores.shape, (costs.shape, scores.shape) gscores[costs > best_costs] = min_score max_ = scores.max(axis=1, keepdims=True) @@ -336,25 +336,29 @@ class Parser(TrainablePipe): cdef int nF = self.model.get_dim("nF") cdef int nO = moves.n_moves cdef int nS = sum([len(history) for history in histories]) - cdef np.ndarray costs = numpy.zeros((nS, nO), dtype="f") cdef Pool mem = Pool() is_valid = mem.alloc(nO, sizeof(int)) - c_costs = costs.data + c_costs = mem.alloc(nO, sizeof(float)) states = moves.init_batch([eg.x for eg in examples]) - cdef int i = 0 - for eg, state, history in zip(examples, states, histories): - if len(history) == 0: - continue - gold = moves.init_gold(state, eg) - for clas in history: - moves.set_costs(is_valid, &c_costs[i*nO], state.c, gold) + batch = [] + for eg, s, h in zip(examples, states, histories): + if not s.is_final(): + gold = moves.init_gold(s, eg) + batch.append((eg, s, h, gold)) + output = [] + while batch: + costs = numpy.zeros((len(batch), nO), dtype="f") + for i, (eg, state, history, gold) in enumerate(batch): + clas = history.pop(0) + moves.set_costs(is_valid, c_costs, state.c, gold) action = moves.c[clas] action.do(state.c, action.label) state.c.history.push_back(clas) - i += 1 - # If the model is on GPU, copy the costs to device. 
- costs = self.model.ops.asarray(costs) - return costs + for j in range(nO): + costs[i, j] = c_costs[j] + output.append(costs) + batch = [(eg, s, h, g) for eg, s, h, g in batch if len(h) != 0] + return self.model.ops.xp.vstack(output) def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index efc7ebc1b..5213d4d11 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -10,6 +10,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example from spacy.tokens import Doc, Span from spacy.vocab import Vocab +from thinc.api import fix_random_seed import logging from ..util import make_tempdir @@ -405,6 +406,7 @@ def test_block_ner(): def test_overfitting_IO(): + fix_random_seed(1) # Simple test to try and quickly overfit the NER component nlp = English() ner = nlp.add_pipe("ner", config={"model": {}}) @@ -418,7 +420,7 @@ def test_overfitting_IO(): for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - assert losses["ner"] < 0.00001 + assert losses["ner"] < 0.001 # test the trained model test_text = "I like London." diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 52e81de94..65c11620e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -6,6 +6,7 @@ from spacy.lang.en import English from spacy.training import Example from spacy.tokens import Doc from spacy import util, registry +from thinc.api import fix_random_seed from ..util import apply_transition_sequence, make_tempdir from ...pipeline import DependencyParser @@ -258,6 +259,7 @@ def test_incomplete_data(pipe_name): @pytest.mark.parametrize("pipe_name", PARSERS) def test_overfitting_IO(pipe_name): + fix_random_seed(0) # Simple test to try and quickly overfit the dependency parser (normal or beam) nlp = English() parser = nlp.add_pipe(pipe_name) @@ -266,6 +268,7 @@ def test_overfitting_IO(pipe_name): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) + #train_examples = train_examples[:1] optimizer = nlp.initialize() # run overfitting for i in range(200): From c45e5ac5b70cf02ef3e65d0e020c249d81c0c365 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 17:06:10 +0100 Subject: [PATCH 63/74] Fix missing prints --- spacy/ml/tb_framework.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 4d0d3283b..fba35fbfd 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -217,10 +217,6 @@ def _forward_reference( model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool ): """Slow reference implementation, without the precomputation""" - def debug_predict(*msg): - if not is_train: - pass - #print(*msg) nF = model.get_dim("nF") tok2vec = model.get_ref("tok2vec") lower_pad = model.get_param("lower_pad") @@ -237,9 +233,6 @@ def _forward_reference( docs, moves = docs_moves states = moves.init_batch(docs) tokvecs, backprop_tok2vec = tok2vec(docs, is_train) - debug_predict("Tokvecs shape", tokvecs.shape) - debug_predict("Tokvecs mean", tokvecs.mean(axis=1)) - debug_predict("Tokvecs var", tokvecs.var(axis=1)) all_ids = [] all_which = [] all_statevecs = [] @@ -252,7 +245,6 @@ def _forward_reference( ids = ids[: len(next_states)] for i, 
state in enumerate(next_states): state.set_context_tokens(ids, i, nF) - debug_predict(ids) # Sum the state features, add the bias and apply the activation (maxout) # to create the state vectors. tokfeats3f = model.ops.alloc3f(ids.shape[0], nF, nI) @@ -260,10 +252,8 @@ def _forward_reference( for j in range(nF): if ids[i, j] == -1: tokfeats3f[i, j] = lower_pad - debug_predict("Setting tokfeat", i, j, "to pad") else: tokfeats3f[i, j] = tokvecs[ids[i, j]] - debug_predict("Setting tokfeat", i, j, "to", ids[i, j]) tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1) preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True) preacts2f += lower_b From 0cdbcd8b9a39f6f5a8af9be7f61dd629fdb668b1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 31 Oct 2021 17:07:32 +0100 Subject: [PATCH 64/74] Add missing file --- spacy/ml/_precomputable_affine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index e69de29bb..ada04b26a 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -0,0 +1,2 @@ +class PrecomputableAffine: + pass From 160dbc58eae17ed8ecd25fb498519f664a3241ac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 1 Nov 2021 00:23:15 +0100 Subject: [PATCH 65/74] Improve indexing on reference implementation --- spacy/ml/tb_framework.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index fba35fbfd..55eaefec9 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -233,6 +233,7 @@ def _forward_reference( docs, moves = docs_moves states = moves.init_batch(docs) tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + tokvecs = model.ops.xp.vstack((tokvecs, lower_pad)) all_ids = [] all_which = [] all_statevecs = [] @@ -247,13 +248,7 @@ def _forward_reference( state.set_context_tokens(ids, i, nF) # Sum the state features, add the bias and apply the activation (maxout) # to create the state vectors. 
- tokfeats3f = model.ops.alloc3f(ids.shape[0], nF, nI) - for i in range(ids.shape[0]): - for j in range(nF): - if ids[i, j] == -1: - tokfeats3f[i, j] = lower_pad - else: - tokfeats3f[i, j] = tokvecs[ids[i, j]] + tokfeats3f = tokvecs[ids] tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1) preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True) preacts2f += lower_b @@ -312,16 +307,9 @@ def _forward_reference( d_tokfeats = model.ops.gemm(d_preacts2f, lower_W) # Get the gradients of the tokvecs and the padding d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI) - d_lower_pad = model.ops.alloc1f(nI) - assert ids.shape[0] == nS - for i in range(ids.shape[0]): - for j in range(ids.shape[1]): - if ids[i, j] == -1: - d_lower_pad += d_tokfeats3f[i, j] - else: - d_tokvecs[ids[i, j]] += d_tokfeats3f[i, j] - model.inc_grad("lower_pad", d_lower_pad) - return (backprop_tok2vec(d_tokvecs), None) + model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f) + model.inc_grad("lower_pad", d_tokvecs[-1]) + return (backprop_tok2vec(d_tokvecs[:-1]), None) return (states, all_scores), backprop_parser From 07603a26ae1027cb9792a6b81e2101fe6203db68 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 1 Nov 2021 01:32:29 +0100 Subject: [PATCH 66/74] Get non-reference forward func working --- spacy/ml/tb_framework.py | 103 ++++++++++++--------------------------- 1 file changed, 32 insertions(+), 71 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 55eaefec9..753c99cb9 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -30,7 +30,7 @@ def TransitionModel( return Model( name="parser_model", - forward=_forward_reference, + forward=forward, init=init, layers=[tok2vec_projected], refs={"tok2vec": tok2vec_projected}, @@ -113,7 +113,7 @@ def init( Wu = ops.alloc2f(nO, nH) bu = ops.alloc1f(nO) Wu = zero_init(ops, Wu.shape) - #Wl = zero_init(ops, Wl.shape) + # Wl = zero_init(ops, Wl.shape) Wl = glorot_uniform_init(ops, Wl.shape) padl = uniform_init(ops, padl.shape) # type: ignore # TODO: Experiment with whether better to initialize upper_W @@ -143,12 +143,12 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo docs, moves = docs_moves states = moves.init_batch(docs) tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + tokvecs = model.ops.xp.vstack((tokvecs, lower_pad)) feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) all_ids = [] all_which = [] all_statevecs = [] all_scores = [] - all_tokfeats = [] next_states = [s for s in states if not s.is_final()] unseen_mask = _get_unseen_mask(model) ids = numpy.zeros((len(states), nF), dtype="i") @@ -157,11 +157,16 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo ids = ids[: len(next_states)] for i, state in enumerate(next_states): state.set_context_tokens(ids, i, nF) - preacts = feats[ids, arange].sum(axis=1) # type: ignore + # Sum the state features, add the bias and apply the activation (maxout) + # to create the state vectors. + preacts2f = feats[ids, arange].sum(axis=1) # type: ignore + preacts2f += lower_b + preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP) + assert preacts.shape[0] == len(next_states), preacts.shape statevecs, which = ops.maxout(preacts) # Multiply the state-vector by the scores weights and add the bias, # to get the logits. 
- scores = ops.gemm(statevecs, upper_W, trans2=True) + scores = model.ops.gemm(statevecs, upper_W, trans2=True) scores += upper_b scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores) # Transition the states, filtering out any that are finished. @@ -169,17 +174,15 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo all_scores.append(scores) if is_train: # Remember intermediate results for the backprop. - all_tokfeats.append(tokfeats) all_ids.append(ids.copy()) all_statevecs.append(statevecs) all_which.append(which) - nS = sum(len(s.history) for s in states) - def backprop_parser(d_states_d_scores): d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1]) ids = model.ops.xp.vstack(all_ids) which = ops.xp.vstack(all_which) + statevecs = model.ops.xp.vstack(all_statevecs) _, d_scores = d_states_d_scores if model.attrs.get("unseen_classes"): # If we have a negative gradient (i.e. the probability should @@ -189,26 +192,23 @@ def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: boo if (d_scores[:, clas] < 0).any(): model.attrs["unseen_classes"].remove(clas) d_scores *= unseen_mask - statevecs = ops.xp.vstack(all_statevecs) - tokfeats = ops.xp.vstack(all_tokfeats) - assert statevecs.shape == (nS, nH), statevecs.shape - assert d_scores.shape == (nS, nO), d_scores.shape # Calculate the gradients for the parameters of the upper layer. + # The weight gemm is (nS, nO) @ (nS, nH).T model.inc_grad("upper_b", d_scores.sum(axis=0)) model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True)) # Now calculate d_statevecs, by backproping through the upper linear layer. + # This gemm is (nS, nO) @ (nO, nH) d_statevecs = model.ops.gemm(d_scores, upper_W) # Backprop through the maxout activation - d_preacts = model.ops.backprop_maxout(d_statevecs, which, model.get_dim("nP")) - model.inc_grad("lower_b", d_preacts.sum(axis=0)) - model.inc_grad("lower_W", model.ops.gemm(d_preacts, tokfeats, trans1=True)) + d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP) + d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP) + model.inc_grad("lower_b", d_preacts2f.sum(axis=0)) # We don't need to backprop the summation, because we pass back the IDs instead - d_state_features = backprop_feats((d_preacts, all_ids)) - ids1d = model.ops.xp.vstack(all_ids).flatten() - d_state_features = d_state_features.reshape((ids1d.size, -1)) - d_tokvecs = model.ops.alloc((tokvecs.shape[0] + 1, tokvecs.shape[1])) - model.ops.scatter_add(d_tokvecs, ids1d, d_state_features) - return (backprop_tok2vec(d_tokvecs), None) + d_state_features = backprop_feats((d_preacts2f, ids)) + d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1]) + model.ops.scatter_add(d_tokvecs, ids, d_state_features) + model.inc_grad("lower_pad", d_tokvecs[-1]) + return (backprop_tok2vec(d_tokvecs[:-1]), None) return (states, all_scores), backprop_parser @@ -314,7 +314,6 @@ def _forward_reference( return (states, all_scores), backprop_parser - def _get_unseen_mask(model: Model) -> Floats1d: mask = model.ops.alloc1f(model.get_dim("nO")) mask.fill(1) @@ -324,17 +323,18 @@ def _get_unseen_mask(model: Model) -> Floats1d: def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): - - W: Floats4d = model.get_param("lower_W") - pad: Floats4d = model.get_param("lower_pad") + W: Floats2d = model.get_param("lower_W") nF = model.get_dim("nF") nH = model.get_dim("nH") nP = model.get_dim("nP") nI = model.get_dim("nI") + # The weights start out (nH * nP, nF * 
nI). Transpose and reshape to (nF * nH *nP, nI) + W3f = model.ops.reshape3f(W, nH * nP, nF, nI) + W3f = W3f.transpose((1, 0, 2)) + W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI) assert X.shape == (X.shape[0], nI), X.shape - Yf_ = model.ops.gemm(X, model.ops.reshape2f(W, nF * nH * nP, nI), trans2=True) - Yf = model.ops.reshape4f(Yf_, Yf_.shape[0], nF, nH, nP) - Yf = model.ops.xp.vstack((Yf, pad)) + Yf_ = model.ops.gemm(X, W2f, trans2=True) + Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP) def backward(dY_ids: Tuple[Floats3d, Ints2d]): # This backprop is particularly tricky, because we get back a different @@ -351,54 +351,15 @@ def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): # However, we avoid building that array for efficiency -- and just pass # in the indices. dY, ids = dY_ids - assert dY.ndim == 3 - assert dY.shape[1] == nH, dY.shape - assert dY.shape[2] == nP, dY.shape - # nB = dY.shape[0] - # model.inc_grad( - # "lower_pad", _backprop_precomputable_affine_padding(model, dY, ids) - # ) - # model.inc_grad("lower_b", dY.sum(axis=0)) # type: ignore - dY = model.ops.reshape2f(dY, dY.shape[0], nH * nP) - Wopfi = W.transpose((1, 2, 0, 3)) - Wopfi = Wopfi.reshape((nH * nP, nF * nI)) - dXf = model.ops.gemm(dY.reshape((dY.shape[0], nH * nP)), Wopfi) - ids1d = model.ops.xp.vstack(ids).flatten() - Xf = model.ops.reshape2f(X[ids1d], -1, nF * nI) - dWopfi = model.ops.gemm(dY, Xf, trans1=True) - dWopfi = dWopfi.reshape((nH, nP, nF, nI)) - # (o, p, f, i) --> (f, o, p, i) - dWopfi = dWopfi.transpose((2, 0, 1, 3)) - model.inc_grad("lower_W", dWopfi) + dXf = model.ops.gemm(dY, W) + Xf = X[ids].reshape((ids.shape[0], -1)) + dW = model.ops.gemm(dY, Xf, trans1=True) + model.inc_grad("lower_W", dW) return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI) return Yf, backward -def _backprop_precomputable_affine_padding(model, dY, ids): - ids = model.ops.xp.vstack(ids) - nB = dY.shape[0] - nF = model.get_dim("nF") - nP = model.get_dim("nP") - nH = model.get_dim("nH") - # Backprop the "padding", used as a filler for missing values. - # Values that are missing are set to -1, and each state vector could - # have multiple missing values. The padding has different values for - # different missing features. 
The gradient of the padding vector is: - # - # for b in range(nB): - # for f in range(nF): - # if ids[b, f] < 0: - # d_pad[f] += dY[b] - # - # Which can be rewritten as: - # - # (ids < 0).T @ dY - mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nH * nP), trans1=True) - return d_pad.reshape((1, nF, nH, nP)) - - def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]: if Y is None: return None From 394862b0f49605c6a96d5ab3b802caab08244510 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 1 Nov 2021 12:39:16 +0100 Subject: [PATCH 67/74] Start rigging beam back up --- spacy/pipeline/transition_parser.pyx | 56 +++++++++++++++++++--------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 108d20da8..b32aa29e5 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -7,6 +7,7 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free import random +import contextlib import srsly from thinc.api import set_dropout_rate, CupyOps, get_array_module @@ -210,14 +211,21 @@ class Parser(TrainablePipe): with self.model.use_params(params): yield + def __call__(self, Doc doc): + """Apply the parser or entity recognizer, setting the annotations onto + the `Doc` object. + + doc (Doc): The document to be processed. + """ + states = self.predict([doc]) + self.set_annotations([doc], states) + return doc + def pipe(self, docs, *, int batch_size=256): """Process a stream of documents. stream: The sequence of documents to process. batch_size (int): Number of documents to accumulate into a working set. - error_handler (Callable[[str, List[Doc], Exception], Any]): Function that - deals with a failing batch of documents. The default function just reraises - the exception. YIELDS (Doc): Documents, in order. """ @@ -242,27 +250,23 @@ class Parser(TrainablePipe): if not any(len(doc) for doc in docs): result = self.moves.init_batch(docs) return result - if self.cfg["beam_width"] == 1: - return self.greedy_parse(docs, drop=0.0) - else: - return self.beam_parse( - docs, - drop=0.0, - beam_width=self.cfg["beam_width"], - beam_density=self.cfg["beam_density"] - ) + with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]): + states_or_beams, _ = self.model.predict((docs, self.moves)) + return states_or_beams def greedy_parse(self, docs, drop=0.): - set_dropout_rate(self.model, drop) - # This is pretty dirty, but the NER can resize itself in init_batch, - # if labels are missing. We therefore have to check whether we need to - # expand our model output. 
+ # Deprecated self._resize() - states, scores = self.model.predict((docs, self.moves)) + with _change_attrs(self.model, beam_width=1): + states, _ = self.model.predict((docs, self.moves)) return states def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): - raise NotImplementedError + # Deprecated + self._resize() + with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]): + beams, _ = self.model.predict((docs, self.moves)) + return beams def set_annotations(self, docs, states_or_beams): cdef StateClass state @@ -461,3 +465,19 @@ class Parser(TrainablePipe): except AttributeError: raise ValueError(Errors.E149) from None return self + + +@contextlib.contextmanager +def _change_attrs(model, **kwargs): + """Temporarily modify a thinc model's attributes.""" + unset = object() + old_attrs = {} + for key, value in kwargs.items(): + old_attrs[key] = model.attrs.get(key, unset) + model.attrs[key] = value + yield model + for key, value in old_attrs.items(): + if value is unset: + model.attrs.pop(key) + else: + model.attrs[key] = value From 68e3d464b698073867ba5cb1546b3fb3f7e78e80 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 16:48:47 +0100 Subject: [PATCH 68/74] removing redundant tests, cf #8106 --- spacy/tests/parser/test_ner.py | 35 -------------------------------- spacy/tests/parser/test_parse.py | 2 +- 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 5213d4d11..c7e4fb826 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -132,41 +132,6 @@ def test_negative_sample_key_is_in_config(vocab, entity_types): assert tsys.cfg["neg_key"] == "non_entities" -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): - entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots] - example = Example.from_dict(doc, {"entities": entity_annots}) - ex_dict = example.to_dict() - - for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]): - if tag == "L-!GPE": - ex_dict["doc_annotation"]["entities"][i] = "-" - example = Example.from_dict(doc, ex_dict) - - act_classes = tsys.get_oracle_sequence(example) - names = [tsys.get_class_name(act) for act in act_classes] - assert names - - -def test_get_oracle_moves_negative_entities2(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] - example = Example.from_dict(doc, {"entities": entity_annots}) - act_classes = tsys.get_oracle_sequence(example) - names = [tsys.get_class_name(act) for act in act_classes] - assert names - - -@pytest.mark.skip(reason="Maybe outdated? Unsure") -def test_get_oracle_moves_negative_O(tsys, vocab): - doc = Doc(vocab, words=["A", "B", "C", "D"]) - entity_annots = ["O", "!O", "O", "!O"] - example = Example.from_dict(doc, {"entities": entity_annots}) - act_classes = tsys.get_oracle_sequence(example) - names = [tsys.get_class_name(act) for act in act_classes] - assert names - - # We can't easily represent this on a Doc object. Not sure what the best solution # would be, but I don't think it's an important use case? 
@pytest.mark.skip(reason="No longer supported") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 65c11620e..d597d353d 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -268,7 +268,7 @@ def test_overfitting_IO(pipe_name): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) - #train_examples = train_examples[:1] + # train_examples = train_examples[:1] optimizer = nlp.initialize() # run overfitting for i in range(200): From 79469ced528f590380f24c0d882b4eb82cee8d04 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 17:13:18 +0100 Subject: [PATCH 69/74] black formatting --- spacy/pipeline/dep_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/dep_parser.py b/spacy/pipeline/dep_parser.py index 446c043f0..7cf11de64 100644 --- a/spacy/pipeline/dep_parser.py +++ b/spacy/pipeline/dep_parser.py @@ -227,6 +227,7 @@ def parser_score(examples, **kwargs): DOCS: https://spacy.io/api/dependencyparser#score """ + def has_sents(doc): return doc.has_annotation("SENT_START") @@ -234,8 +235,11 @@ def parser_score(examples, **kwargs): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep + results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + results.update( + Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + ) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) From 4d9d9c5a2865a028a961acd13376127a0cf92057 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 17:16:37 +0100 Subject: [PATCH 70/74] temporarily xfailing issue 4314 --- spacy/tests/parser/test_ner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 05a466d87..c7eef189a 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -181,6 +181,7 @@ def test_issue4267(): assert token.ent_iob == 2 +@pytest.mark.xfail(reason="no beam parser yet") @pytest.mark.issue(4313) def test_issue4313(): """This should not crash or exit with some strange error code""" From ca6aa239bc30cb1e895ebc2335be8316bd47fdda Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 17:20:34 +0100 Subject: [PATCH 71/74] make flake8 happy again --- spacy/tests/test_misc.py | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 0f804b42a..7374b827a 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -78,33 +78,33 @@ def test_util_get_package_path(package): assert isinstance(path, Path) -@pytest.mark.xfail(reason="No precomputable affine") -def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): - model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize() - assert model.get_param("W").shape == (nF, nO, nP, nI) - tensor = model.ops.alloc((10, nI)) - Y, get_dX = model.begin_update(tensor) - assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP) - dY = model.ops.alloc((15, nO, nP)) - ids = model.ops.alloc((15, nF)) - ids[1, 2] = -1 - dY[1] = 1 - assert not model.has_grad("pad") - d_pad = _backprop_precomputable_affine_padding(model, dY, ids) - assert d_pad[0, 2, 0, 0] == 1.0 - ids.fill(0.0) - dY.fill(0.0) - dY[0] = 0 - 
ids[1, 2] = 0 - ids[1, 1] = -1 - ids[1, 0] = -1 - dY[1] = 1 - ids[2, 0] = -1 - dY[2] = 5 - d_pad = _backprop_precomputable_affine_padding(model, dY, ids) - assert d_pad[0, 0, 0, 0] == 6 - assert d_pad[0, 1, 0, 0] == 1 - assert d_pad[0, 2, 0, 0] == 0 +# @pytest.mark.skip(reason="No precomputable affine") +# def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): +# model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize() +# assert model.get_param("W").shape == (nF, nO, nP, nI) +# tensor = model.ops.alloc((10, nI)) +# Y, get_dX = model.begin_update(tensor) +# assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP) +# dY = model.ops.alloc((15, nO, nP)) +# ids = model.ops.alloc((15, nF)) +# ids[1, 2] = -1 +# dY[1] = 1 +# assert not model.has_grad("pad") +# d_pad = _backprop_precomputable_affine_padding(model, dY, ids) +# assert d_pad[0, 2, 0, 0] == 1.0 +# ids.fill(0.0) +# dY.fill(0.0) +# dY[0] = 0 +# ids[1, 2] = 0 +# ids[1, 1] = -1 +# ids[1, 0] = -1 +# dY[1] = 1 +# ids[2, 0] = -1 +# dY[2] = 5 +# d_pad = _backprop_precomputable_affine_padding(model, dY, ids) +# assert d_pad[0, 0, 0, 0] == 6 +# assert d_pad[0, 1, 0, 0] == 1 +# assert d_pad[0, 2, 0, 0] == 0 def test_prefer_gpu(): From 6d32ae01daeba20b3d3f1abab8ce8906aac34e3a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 17:50:37 +0100 Subject: [PATCH 72/74] mypy fixes --- spacy/ml/tb_framework.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 753c99cb9..9aac5b801 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Any, Optional +from typing import List, Tuple, Any, Optional, cast from thinc.api import Ops, Model, normal_init, chain, list2array, Linear from thinc.api import uniform_init, glorot_uniform_init, zero_init from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d @@ -399,10 +399,10 @@ def _lsuv_init(model: Model): model.set_param("b", b) model.set_param("pad", pad) - ids = ops.alloc((5000, nF), dtype="f") + ids = ops.alloc_f((5000, nF), dtype="f") ids += ops.xp.random.uniform(0, 1000, ids.shape) ids = ops.asarray(ids, dtype="i") - tokvecs = ops.alloc((5000, nI), dtype="f") + tokvecs = ops.alloc_f((5000, nI), dtype="f") tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( tokvecs.shape ) @@ -421,8 +421,8 @@ def _lsuv_init(model: Model): tol_var = 0.01 tol_mean = 0.01 t_max = 10 - W = model.get_param("lower_W").copy() - b = model.get_param("lower_b").copy() + W = cast(Floats4d, model.get_param("lower_W").copy()) + b = cast(Floats2d, model.get_param("lower_b").copy()) for t_i in range(t_max): acts1 = predict(ids, tokvecs) var = model.ops.xp.var(acts1) From c4c41b14cf9552d1b1d4cb51cf659d0cd08c99c1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 18:00:31 +0100 Subject: [PATCH 73/74] ensure labels are added upon predict --- spacy/pipeline/transition_parser.pyx | 5 +++-- spacy/tests/parser/test_add_label.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 79e089065..c5591a9f3 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -240,6 +240,7 @@ class Parser(TrainablePipe): def predict(self, docs): if isinstance(docs, Doc): docs = [docs] + self._ensure_labels_are_added(docs) if not any(len(doc) for doc in docs): result = self.moves.init_batch(docs) return result @@ -248,14 
+249,14 @@ class Parser(TrainablePipe): return states_or_beams def greedy_parse(self, docs, drop=0.): - # Deprecated + # TODO: Deprecated self._resize() with _change_attrs(self.model, beam_width=1): states, _ = self.model.predict((docs, self.moves)) return states def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): - # Deprecated + # TODO: Deprecated self._resize() with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]): beams, _ = self.model.predict((docs, self.moves)) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 540b00f89..4c775a913 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -135,6 +135,7 @@ def test_ner_labels_added_implicitly_on_beam_parse(): assert "D" in ner.labels +@pytest.mark.skip(reason="greedy_parse is deprecated") def test_ner_labels_added_implicitly_on_greedy_parse(): nlp = Language() ner = nlp.add_pipe("beam_ner") From 6243ac35eb1b59e63addafbf630ac978d7a604d8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 20 Jan 2022 18:14:26 +0100 Subject: [PATCH 74/74] cleanup remnants from merge conflicts --- spacy/tests/regression/test_issue4001-4500.py | 0 spacy/tokens/doc.pyx | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 spacy/tests/regression/test_issue4001-4500.py diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index eeb7dc965..5a0db115d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -616,7 +616,7 @@ cdef class Doc: """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: + elif self.vocab.vectors.size: return True elif self.tensor.size: return True
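
A note for readers of this series: the `_lsuv_init` docstring filled in earlier describes a "layer sequential unit variance"-style initialization run on randomly generated whitened data rather than real inputs — keep rescaling the weights until the layer's outputs have unit variance, then shift the bias until they have zero mean. The following is a stripped-down sketch of that loop for an ordinary dense layer in plain numpy (hypothetical sizes, no maxout and no feature summation); it is an illustration only and not part of any patch above.

    import numpy as np

    rng = np.random.default_rng(0)
    nI, nH = 8, 6                            # hypothetical input / hidden sizes
    W = rng.normal(scale=0.5, size=(nH, nI))
    b = np.zeros(nH)

    X = rng.normal(size=(5000, nI))          # synthetic whitened data stands in for real inputs

    tol_var, tol_mean, t_max = 0.01, 0.01, 10
    for _ in range(t_max):
        acts = X @ W.T + b
        var, mean = acts.var(), acts.mean()
        if abs(var - 1.0) >= tol_var:
            W /= np.sqrt(var)                # rescale weights towards unit output variance
        elif abs(mean) >= tol_mean:
            b -= mean                        # then shift the bias towards zero mean
        else:
            break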
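
The gather/scatter pattern that patches 65 and 66 settle on works because the forward pass builds each state's token features by indexing rows of `tokvecs` with `ids`; the gradient with respect to `tokvecs` is therefore just a scatter-add of the upstream gradient over those same ids, which is why the backward pass can pass the ids back instead of materializing any one-hot matrix. The sketch below checks that identity with plain numpy, using `np.add.at` to stand in for `ops.scatter_add`; the array sizes are made up for illustration and the code is not part of any patch.

    import numpy as np

    rng = np.random.default_rng(0)
    nS, nF, nI, nW = 3, 2, 4, 5               # hypothetical: states, features, width, tokens
    tokvecs = rng.normal(size=(nW + 1, nI))   # extra final row plays the role of lower_pad
    ids = rng.integers(0, nW + 1, size=(nS, nF))

    # Forward: each state's token features are rows of tokvecs gathered by ids
    feats = tokvecs[ids]                      # shape (nS, nF, nI)

    # Backward: scatter-add the incoming gradient back onto the gathered rows
    d_feats = rng.normal(size=feats.shape)
    d_tokvecs = np.zeros_like(tokvecs)
    np.add.at(d_tokvecs, ids, d_feats)        # numpy equivalent of ops.scatter_add

    # The explicit loop gives the same result
    d_check = np.zeros_like(tokvecs)
    for s in range(nS):
        for f in range(nF):
            d_check[ids[s, f]] += d_feats[s, f]
    assert np.allclose(d_tokvecs, d_check)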
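
The `_backprop_precomputable_affine_padding` helper removed in patch 66 derives its padding gradient by rewriting a double loop as a single matrix product, `(ids < 0).T @ dY`, so it can be computed with one gemm. That identity can be verified in isolation; below is a minimal numpy sketch, with made-up sizes (`nO` standing in for the flattened `nH * nP`), not part of any patch.

    import numpy as np

    rng = np.random.default_rng(0)
    nB, nF, nO = 4, 3, 5                      # hypothetical: batch, features, flattened output
    ids = rng.integers(-1, 6, size=(nB, nF))  # -1 marks a missing feature
    dY = rng.normal(size=(nB, nO)).astype("f")

    # Loop form, as written in the removed comment
    d_pad_loop = np.zeros((nF, nO), dtype="f")
    for b in range(nB):
        for f in range(nF):
            if ids[b, f] < 0:
                d_pad_loop[f] += dY[b]

    # Matrix form: (ids < 0).T @ dY
    d_pad_gemm = (ids < 0).astype("f").T @ dY

    assert np.allclose(d_pad_loop, d_pad_gemm)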