Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-11-08 20:07:51 +03:00 · 2018-12-30 15:49:57 +01:00 · 2018-12-30 15:49:57 +01:00 · 3c09d3d986
commit 3c09d3d986
parent d8d0ce081b 63b7accd74
5 changed files with 56 additions and 14 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -152,6 +152,9 @@ def test_span_as_doc(doc):
    span = doc[4:10]
    span_doc = span.as_doc()
    assert span.text == span_doc.text.strip()
    assert isinstance(span_doc, doc.__class__)
    assert span_doc is not doc
    assert span_doc[0].idx == 0
 def test_span_string_label(doc):
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@ -247,6 +247,16 @@ def test_issue1945():
    assert matches[1][1:] == (1, 3)
 def test_issue1963(en_tokenizer):
    """Merging tokens via the retokenizer must resize doc.tensor to match."""
    doc = en_tokenizer('a b c d')
    width = 128
    doc.tensor = numpy.ones((len(doc), width), dtype='f')
    # Merge the first two tokens into a single token.
    with doc.retokenize() as retok:
        retok.merge(doc[0:2])
    # One row per remaining token; the row width is untouched.
    assert len(doc) == 3
    assert doc.tensor.shape == (3, width)
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
    ner = EntityRecognizer(Vocab())
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -7,7 +7,9 @@ from __future__ import unicode_literals
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, free
 import numpy
 from cymem.cymem cimport Pool
 from thinc.neural.util import get_array_module
 from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
 from .span cimport Span
@ -83,6 +85,11 @@ def _merge(Doc doc, int start, int end, attributes):
    cdef Span span = doc[start:end]
    cdef int start_char = span.start_char
    cdef int end_char = span.end_char
    # Resize the doc.tensor, if it's set. Let the last row of the merged span
    # stand for the merged region. To do this, we collect the indices of the
    # rows to drop and remove them with the array module's delete().
    if doc.tensor is not None and doc.tensor.size != 0:
        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
    # Get LexemeC for newly merged token
    new_orth = ''.join([t.text_with_ws for t in span])
    if span[-1].whitespace_:
@ -182,7 +189,12 @@ def _bulk_merge(Doc doc, merges):
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
-
+    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we collect the indices of the rows to
    # drop and remove them with the array module's delete().
    if doc.tensor is not None and doc.tensor.size != 0:
        doc.tensor = _resize_tensor(doc.tensor,
            [(m[1][0].start, m[1][0].end) for m in merges])
    # Memorize span roots and sets dependencies of the newly merged
    # tokens to the dependencies of their roots.
    span_roots = []
@ -276,6 +288,14 @@ def _bulk_merge(Doc doc, merges):
                else:
                    # If they're not the same entity type, let them be two entities
                    doc.c[token_after_span_position].ent_iob = 3
    # Return the merged Python object
    return doc[spans[0].start]
 def _resize_tensor(tensor, ranges):
    """Drop the tensor rows made redundant by merging the given ranges.

    tensor: Array whose rows (axis 0) are addressed by the ranges.
    ranges: Iterable of (start, end) index pairs, with `end` exclusive.
    RETURNS: The result of the array module's `delete` along axis 0.
    """
    # For every range, each row except the final one (end-1) is removed, so
    # the surviving row stands in for the whole merged region.
    doomed_rows = [
        row
        for first, stop in ranges
        for row in range(first, stop - 1)
    ]
    ops = get_array_module(tensor)
    return ops.delete(tensor, doomed_rows, axis=0)
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -14,7 +14,7 @@ from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..attrs cimport *
 from ..lexeme cimport Lexeme
 from ..compat import is_config, basestring_
 from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
@ -149,23 +149,32 @@ cdef class Span:
    def as_doc(self):
        # TODO: fix
-        """Create a `Doc` object view of the Span's data. This is mostly
+        """Create a `Doc` object with a copy of the Span's data.
        useful for C-typed interfaces.
-        RETURNS (Doc): The `Doc` view of the span.
+        RETURNS (Doc): The `Doc` copy of the span.
        """
-        cdef Doc doc = Doc(self.doc.vocab)
+        cdef Doc doc = Doc(self.doc.vocab,
-        doc.length = self.end-self.start
+            words=[t.text for t in self],
-        doc.c = &self.doc.c[self.start]
+            spaces=[bool(t.whitespace_) for t in self])
-        doc.mem = self.doc.mem
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
-        doc.is_parsed = self.doc.is_parsed
+        if self.doc.is_tagged:
-        doc.is_tagged = self.doc.is_tagged
+            array_head.append(TAG)
        # if doc parsed add head and dep attribute
        if self.doc.is_parsed:
            array_head.extend([HEAD, DEP])
        # otherwise add sent_start
        else:
            array_head.append(SENT_START)
        array = self.doc.to_array(array_head)
        doc.from_array(array_head, array[self.start : self.end])
        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
        doc.user_hooks = self.doc.user_hooks
        doc.user_span_hooks = self.doc.user_span_hooks
        doc.user_token_hooks = self.doc.user_token_hooks
        doc.vector = self.vector
        doc.vector_norm = self.vector_norm
        doc.tensor = self.doc.tensor[self.start : self.end]
        for key, value in self.doc.cats.items():
            if hasattr(key, '__len__') and len(key) == 3:
                cat_start, cat_end, cat_label = key
--- a/website/api/span.jade
+++ b/website/api/span.jade
@ -377,8 +377,8 @@ p
 +h(2, "as_doc") Span.as_doc
 p
-    |  Create a #[code Doc] object view of the #[code Span]'s data. Mostly
+    |  Create a new #[code Doc] object corresponding to the #[code Span], with
-    |  useful for C-typed interfaces.
+    |  a copy of the data.
 +aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')