diff --git a/setup.py b/setup.py
index 00232cc85..fedad560f 100755
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
+    'spacy.tokens._retokenize',
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
new file mode 100644
index 000000000..00f724ed6
--- /dev/null
+++ b/spacy/tokens/_retokenize.pyx
@@ -0,0 +1,129 @@
+# coding: utf8
+# cython: infer_types=True
+# cython: bounds_check=False
+# cython: profile=True
+from __future__ import unicode_literals
+
+from libc.string cimport memcpy, memset
+
+from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
+from .span cimport Span
+from .token cimport Token
+from ..lexeme cimport Lexeme, EMPTY_LEXEME
+from ..structs cimport LexemeC, TokenC
+from ..attrs cimport *
+
+
+cdef class Retokenizer:
+    '''Helper class for doc.retokenize() context manager.'''
+    cdef Doc doc
+    cdef list merges
+    cdef list splits
+    def __init__(self, doc):
+        self.doc = doc
+        self.merges = []
+        self.splits = []
+
+    def merge(self, Span span, attrs=None):
+        '''Mark a span for merging. The attrs will be applied to the resulting
+        token.'''
+        self.merges.append((span.start_char, span.end_char, attrs))
+
+    def split(self, Token token, orths, attrs=None):
+        '''Mark a Token for splitting, into the specified orths. The attrs
+        will be applied to each subtoken.'''
+        self.splits.append((token.start_char, orths, attrs))
+
+    def __enter__(self):
+        self.merges = []
+        self.splits = []
+        return self
+
+    def __exit__(self, *args):
+        # Do the actual merging here
+        for start_char, end_char, attrs in self.merges:
+            start = token_by_start(self.doc.c, self.doc.length, start_char)
+            end = token_by_end(self.doc.c, self.doc.length, end_char)
+            _merge(self.doc, start, end+1, attrs)
+        for start_char, orths, attrs in self.splits:
+            raise NotImplementedError
+
+
+def _merge(Doc doc, int start, int end, attributes):
+    """Retokenize the document, such that the span at
+    `doc.text[start_idx : end_idx]` is merged into a single token. If
+    `start_idx` and `end_idx` do not mark start and end token boundaries,
+    the document remains unchanged.
+
+    start_idx (int): Character index of the start of the slice to merge.
+    end_idx (int): Character index after the end of the slice to merge.
+    **attributes: Attributes to assign to the merged token. By default,
+        attributes are inherited from the syntactic root of the span.
+    RETURNS (Token): The newly merged token, or `None` if the start and end
+        indices did not fall at token boundaries.
+ """ + cdef Span span = doc[start:end] + cdef int start_char = span.start_char + cdef int end_char = span.end_char + # Get LexemeC for newly merged token + new_orth = ''.join([t.text_with_ws for t in span]) + if span[-1].whitespace_: + new_orth = new_orth[:-len(span[-1].whitespace_)] + cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth) + # House the new merged token where it starts + cdef TokenC* token = &doc.c[start] + token.spacy = doc.c[end-1].spacy + for attr_name, attr_value in attributes.items(): + if attr_name == TAG: + doc.vocab.morphology.assign_tag(token, attr_value) + else: + Token.set_struct_attr(token, attr_name, attr_value) + # Make sure ent_iob remains consistent + if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2): + if token.ent_type == doc.c[end].ent_type: + token.ent_iob = 3 + else: + # If they're not the same entity type, let them be two entities + doc.c[end].ent_iob = 3 + # Begin by setting all the head indices to absolute token positions + # This is easier to work with for now than the offsets + # Before thinking of something simpler, beware the case where a + # dependency bridges over the entity. Here the alignment of the + # tokens changes. + span_root = span.root.i + token.dep = span.root.dep + # We update token.lex after keeping span root and dep, since + # setting token.lex will change span.start and span.end properties + # as it modifies the character offsets in the doc + token.lex = lex + for i in range(doc.length): + doc.c[i].head += i + # Set the head of the merged token, and its dep relation, from the Span + token.head = doc.c[span_root].head + # Adjust deps before shrinking tokens + # Tokens which point into the merged token should now point to it + # Subtract the offset from all tokens which point to >= end + offset = (end - start) - 1 + for i in range(doc.length): + head_idx = doc.c[i].head + if start <= head_idx < end: + doc.c[i].head = start + elif head_idx >= end: + doc.c[i].head -= offset + # Now compress the token array + for i in range(end, doc.length): + doc.c[i - offset] = doc.c[i] + for i in range(doc.length - offset, doc.length): + memset(&doc.c[i], 0, sizeof(TokenC)) + doc.c[i].lex = &EMPTY_LEXEME + doc.length -= offset + for i in range(doc.length): + # ...And, set heads back to a relative position + doc.c[i].head -= i + # Set the left/right children, left/right edges + set_children_from_heads(doc.c, doc.length) + # Clear the cached Python objects + # Return the merged Python object + return doc[start] + + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e3fbb4552..c7eac15c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -34,6 +34,7 @@ from ..compat import is_config, copy_reg, pickle, basestring_ from .. import about from .. import util from .underscore import Underscore +from ._retokenize import Retokenizer DEF PADDING = 5 @@ -888,6 +889,18 @@ cdef class Doc: else: self.tensor = xp.hstack((self.tensor, tensor)) + def retokenize(self): + '''Context manager to handle retokenization of the Doc. + Modifications to the Doc's tokenization are stored, and then + made all at once when the context manager exits. This is + much more efficient, and less error-prone. + + All views of the Doc (Span and Token) created before the + retokenization are invalidated, although they may accidentally + continue to work. 
+        '''
+        return Retokenizer(self)
+
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """Retokenize the document, such that the span at
         `doc.text[start_idx : end_idx]` is merged into a single token. If
@@ -941,66 +954,8 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        cdef Span span = self[start:end]
-        # Get LexemeC for newly merged token
-        new_orth = ''.join([t.text_with_ws for t in span])
-        if span[-1].whitespace_:
-            new_orth = new_orth[:-len(span[-1].whitespace_)]
-        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
-        # House the new merged token where it starts
-        cdef TokenC* token = &self.c[start]
-        token.spacy = self.c[end-1].spacy
-        for attr_name, attr_value in attributes.items():
-            if attr_name == TAG:
-                self.vocab.morphology.assign_tag(token, attr_value)
-            else:
-                Token.set_struct_attr(token, attr_name, attr_value)
-        # Make sure ent_iob remains consistent
-        if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
-            if token.ent_type == self.c[end].ent_type:
-                token.ent_iob = 3
-            else:
-                # If they're not the same entity type, let them be two entities
-                self.c[end].ent_iob = 3
-        # Begin by setting all the head indices to absolute token positions
-        # This is easier to work with for now than the offsets
-        # Before thinking of something simpler, beware the case where a
-        # dependency bridges over the entity. Here the alignment of the
-        # tokens changes.
-        span_root = span.root.i
-        token.dep = span.root.dep
-        # We update token.lex after keeping span root and dep, since
-        # setting token.lex will change span.start and span.end properties
-        # as it modifies the character offsets in the doc
-        token.lex = lex
-        for i in range(self.length):
-            self.c[i].head += i
-        # Set the head of the merged token, and its dep relation, from the Span
-        token.head = self.c[span_root].head
-        # Adjust deps before shrinking tokens
-        # Tokens which point into the merged token should now point to it
-        # Subtract the offset from all tokens which point to >= end
-        offset = (end - start) - 1
-        for i in range(self.length):
-            head_idx = self.c[i].head
-            if start <= head_idx < end:
-                self.c[i].head = start
-            elif head_idx >= end:
-                self.c[i].head -= offset
-        # Now compress the token array
-        for i in range(end, self.length):
-            self.c[i - offset] = self.c[i]
-        for i in range(self.length - offset, self.length):
-            memset(&self.c[i], 0, sizeof(TokenC))
-            self.c[i].lex = &EMPTY_LEXEME
-        self.length -= offset
-        for i in range(self.length):
-            # ...And, set heads back to a relative position
-            self.c[i].head -= i
-        # Set the left/right children, left/right edges
-        set_children_from_heads(self.c, self.length)
-        # Clear the cached Python objects
-        # Return the merged Python object
+        with self.retokenize() as retokenizer:
+            retokenizer.merge(self[start:end], attrs=attributes)
         return self[start]
 
     def print_tree(self, light=False, flat=False):
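
Example usage of the new doc.retokenize() API (a minimal sketch, not part of the diff): the sentence, the 'en' model shortcut, and the token slice are illustrative assumptions. attrs is passed as an explicit empty dict because this first version of _merge iterates directly over the attrs mapping, and splitting is not implemented yet.

    import spacy

    nlp = spacy.load('en')  # assumes an English model is installed and linked as 'en'
    doc = nlp(u'I live in New York City')
    with doc.retokenize() as retokenizer:
        # Merges are only recorded here; the Doc is rewritten once,
        # when the context manager exits.
        retokenizer.merge(doc[3:6], attrs={})  # 'New York City' -> one token
    print([t.text for t in doc])  # ['I', 'live', 'in', 'New York City']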