diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 11fc0f228..495ed47be 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -152,6 +152,9 @@ def test_span_as_doc(doc):
     span = doc[4:10]
     span_doc = span.as_doc()
     assert span.text == span_doc.text.strip()
+    assert isinstance(span_doc, doc.__class__)
+    assert span_doc is not doc
+    assert span_doc[0].idx == 0
 
 
 def test_span_string_label(doc):
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index a646afadc..580740a84 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -247,6 +247,16 @@ def test_issue1945():
     assert matches[1][1:] == (1, 3)
 
 
+def test_issue1963(en_tokenizer):
+    """Test that merging tokens resizes doc.tensor."""
+    doc = en_tokenizer('a b c d')
+    doc.tensor = numpy.ones((len(doc), 128), dtype='f')
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2])
+    assert len(doc) == 3
+    assert doc.tensor.shape == (3, 128)
+
+
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 9781b629b..1d1f0e1dc 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -7,7 +7,9 @@ from __future__ import unicode_literals
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, free
 
+import numpy
 from cymem.cymem cimport Pool
+from thinc.neural.util import get_array_module
 
 from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
 from .span cimport Span
@@ -83,6 +85,11 @@ def _merge(Doc doc, int start, int end, attributes):
     cdef Span span = doc[start:end]
     cdef int start_char = span.start_char
     cdef int end_char = span.end_char
+    # Resize doc.tensor, if it's set. Let the last row of the merged region
+    # stand for the merged token. To do this, we collect the indices of the
+    # rows to drop and pass them to numpy.delete.
+    if doc.tensor is not None and doc.tensor.size != 0:
+        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
     # Get LexemeC for newly merged token
     new_orth = ''.join([t.text_with_ws for t in span])
     if span[-1].whitespace_:
@@ -182,7 +189,12 @@ def _bulk_merge(Doc doc, merges):
             else:
                 Token.set_struct_attr(token, attr_name, attr_value)
-
+    # Resize doc.tensor, if it's set. Let the last row of each merged region
+    # stand for its merged token. To do this, we collect the indices of the
+    # rows to drop and pass them to numpy.delete.
+    if doc.tensor is not None and doc.tensor.size != 0:
+        doc.tensor = _resize_tensor(doc.tensor,
+            [(m[1][0].start, m[1][0].end) for m in merges])
     # Memorize span roots and sets dependencies of the newly merged
     # tokens to the dependencies of their roots.
     span_roots = []
@@ -276,6 +288,14 @@ def _bulk_merge(Doc doc, merges):
         else:
             # If they're not the same entity type, let them be two entities
             doc.c[token_after_span_position].ent_iob = 3
-    # Return the merged Python object
     return doc[spans[0].start]
+
+
+def _resize_tensor(tensor, ranges):
+    delete = []
+    for start, end in ranges:
+        for i in range(start, end - 1):
+            delete.append(i)
+    xp = get_array_module(tensor)
+    return xp.delete(tensor, delete, axis=0)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 44ce04f34..440cf1859 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -14,7 +14,7 @@ from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..attrs cimport *
 from ..lexeme cimport Lexeme
 from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
@@ -149,23 +149,32 @@ cdef class Span:
 
     def as_doc(self):
         # TODO: fix
-        """Create a `Doc` object view of the Span's data. This is mostly
-        useful for C-typed interfaces.
+        """Create a `Doc` object with a copy of the Span's data.
 
-        RETURNS (Doc): The `Doc` view of the span.
+        RETURNS (Doc): The `Doc` copy of the span.
         """
-        cdef Doc doc = Doc(self.doc.vocab)
-        doc.length = self.end-self.start
-        doc.c = &self.doc.c[self.start]
-        doc.mem = self.doc.mem
-        doc.is_parsed = self.doc.is_parsed
-        doc.is_tagged = self.doc.is_tagged
+        cdef Doc doc = Doc(self.doc.vocab,
+                           words=[t.text for t in self],
+                           spaces=[bool(t.whitespace_) for t in self])
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
+        if self.doc.is_tagged:
+            array_head.append(TAG)
+        # If the doc is parsed, copy the head and dep attributes
+        if self.doc.is_parsed:
+            array_head.extend([HEAD, DEP])
+        # Otherwise, copy sent_start
+        else:
+            array_head.append(SENT_START)
+        array = self.doc.to_array(array_head)
+        doc.from_array(array_head, array[self.start : self.end])
+        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks
         doc.user_token_hooks = self.doc.user_token_hooks
         doc.vector = self.vector
         doc.vector_norm = self.vector_norm
+        doc.tensor = self.doc.tensor[self.start : self.end]
         for key, value in self.doc.cats.items():
             if hasattr(key, '__len__') and len(key) == 3:
                 cat_start, cat_end, cat_label = key
diff --git a/website/api/span.jade b/website/api/span.jade
index e13fa29a5..7b098123d 100644
--- a/website/api/span.jade
+++ b/website/api/span.jade
@@ -377,8 +377,8 @@ p
 +h(2, "as_doc") Span.as_doc
 
 p
-    |  Create a #[code Doc] object view of the #[code Span]'s data. Mostly
-    |  useful for C-typed interfaces.
+    |  Create a new #[code Doc] object corresponding to the #[code Span], with
+    |  a copy of the data.
 
 +aside-code("Example").
     doc = nlp(u'I like New York in Autumn.')
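
For reference, the row-deletion logic in `_resize_tensor` can be illustrated with plain numpy, outside the patch. This is a minimal sketch, not part of the diff, and all values are made up for illustration: for a merge of tokens `[start, end)`, rows `start` through `end - 2` are dropped, so the last row of the region (`end - 1`) survives to represent the merged token.

```python
import numpy

# 4 tokens, 2 tensor dimensions; values are arbitrary.
tensor = numpy.arange(8, dtype='f').reshape(4, 2)
start, end = 0, 2                        # merging doc[0:2]
delete = list(range(start, end - 1))     # rows to drop: [0]
resized = numpy.delete(tensor, delete, axis=0)

assert resized.shape == (3, 2)           # one row per remaining token
assert (resized[0] == tensor[1]).all()   # last row of the region survives
```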
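And a hedged usage sketch of the new `Span.as_doc` copy semantics, mirroring the updated tests. The model name `en_core_web_sm` is an assumption here; any pipeline that sets `doc.tensor` would do.

```python
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                # "New York"
span_doc = span.as_doc()

assert span_doc is not doc     # a standalone copy, not a view
assert span_doc[0].idx == 0    # offsets are relative to the new Doc
# The parent's tensor rows for the span are carried over.
assert span_doc.tensor.shape[0] == len(span_doc)
```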