From 3e3af016811e8eecb43e72fc198e1a5b779971ce Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 31 Mar 2018 19:32:37 +0200
Subject: [PATCH] Add notes for adding retokenize.split()

---
 spacy/tokens/_retokenize.pyx | 41 +++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 00f724ed6..b9d3c2160 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -11,6 +11,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
+from ..typedefs cimport attr_t
 from ..attrs cimport *
 
 
@@ -49,6 +50,21 @@ cdef class Retokenizer:
         raise NotImplementedError
 
 
+def _fix_parse_tree(Doc doc, parse_tree):
+    # Currently unused -- placeholder
+    cdef attr_t label
+    cdef int token_idx, head_idx, child, head
+    for token_idx, (head_idx, label) in parse_tree.items():
+        child = token_by_start(doc.c, doc.length, token_idx)
+        head = token_by_start(doc.c, doc.length, head_idx)
+        if head != -1 and child != -1:
+            doc.c[child].head = head-child
+            doc.c[child].dep = label
+        else:
+            raise NotImplementedError
+    set_children_from_heads(doc.c, doc.length)
+
+
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
     `doc.text[start_idx : end_idx]` is merged into a single token. If
@@ -126,4 +142,27 @@ def _merge(Doc doc, int start, int end, attributes):
     # Return the merged Python object
     return doc[start]
 
-
+# cdef int PADDING = 5
+# cdef int j
+# # Unwind the padding, so we can work with the original pointer.
+# this._sent -= PADDING
+# this._sent = realloc(this._sent,
+#     ((this.length+n+1) + (PADDING * 2)) * sizeof(TokenC))
+# for j in range(this.length+PADDING*2, this.length+n+1+PADDING*2):
+#     this._sent[j] = this._empty_token
+# # Put the start padding back in
+# this._sent += PADDING
+# # In our example, we want to move words 6-10 to 8-12. So we must move
+# # a block of 4 words.
+# cdef int n_moved = this.length - (i+1)
+# cdef int move_from = i+1
+# cdef int move_to = i+n+1
+# memmove(&this._sent[move_to], &this._sent[move_from],
+#         n_moved*sizeof(TokenC))
+# # Now copy the token that has been split into its neighbours.
+# for j in range(i+1, i+n+1):
+#     this._sent[j] = this._sent[i]
+# # Finally, adjust length.
+# this.length += n
+#
+#
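
Note: _fix_parse_tree expects parse_tree as a dict keyed by token start
offset, mapping each key to a (head start offset, dependency label) pair.
The commented-out notes at the end of the file describe the grow/shift/copy
sequence a future retokenize.split() would perform on the token buffer. As
a rough pure-Python sketch of that same sequence (illustrative only:
split_token is a hypothetical name, and a plain list stands in for the
padded TokenC* buffer, so the PADDING bookkeeping is omitted):

    def split_token(tokens, i, n):
        """Return a copy of tokens in which tokens[i] is repeated n extra
        times, mirroring the realloc + memmove + copy steps in the notes."""
        # Grow the buffer by n slots (the realloc step).
        tokens = list(tokens) + [None] * n
        # Shift the tail n slots to the right (the memmove step); the
        # right-hand slice is copied before assignment, so overlap is safe.
        n_moved = len(tokens) - n - (i + 1)
        move_from = i + 1
        move_to = i + n + 1
        tokens[move_to:move_to + n_moved] = tokens[move_from:move_from + n_moved]
        # Copy the token being split into the freed slots.
        for j in range(i + 1, i + n + 1):
            tokens[j] = tokens[i]
        # The "this.length += n" step is implicit in the list growth above.
        return tokens

    # Example: splitting token 2 of a 6-token buffer into 3 pieces.
    # split_token(list("abcdef"), 2, 2)
    # -> ['a', 'b', 'c', 'c', 'c', 'd', 'e', 'f']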