From 1707e77c5e146cebbc553496a1e8894a3facba00 Mon Sep 17 00:00:00 2001
From: tamuhey <tamuhey@gmail.com>
Date: Fri, 13 Dec 2019 23:54:58 +0900
Subject: [PATCH] add char_span to Span (#4793)

---
 spacy/tests/doc/test_span.py | 18 ++++++++++++++++++
 spacy/tokens/span.pyx        | 16 ++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index f813a9743..01bb93c50 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
     return doc
 
 
+@pytest.mark.parametrize(
+    "i_sent,i,j,text",
+    [
+        (0, 0, len("This is a"), "This is a"),
+        (1, 0, len("This is another"), "This is another"),
+        (2, len("And "), len("And ") + len("a third"), "a third"),
+        (0, 1, 2, None),
+    ],
+)
+def test_char_span(doc, i_sent, i, j, text):
+    """Check Span.char_span using offsets relative to the chosen sentence."""
+    span = list(doc.sents)[i_sent].char_span(i, j)
+    if text:
+        assert span.text == text
+    else:
+        assert not span
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9e99392a9..957e853ca 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -584,6 +584,22 @@ cdef class Span:
         else:
             return self.doc[root]
 
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
+        """Create a `Span` object from the slice `span.text[start_idx : end_idx]`.
+
+        start_idx (int): Index of the first character, relative to this span.
+        end_idx (int): Index of the first character after the slice.
+        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The new span, or None if the offsets don't align with tokens.
+        """
+        start_idx += self.start_char  # shift span-relative offsets to doc offsets
+        end_idx += self.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id,
+                                  vector=vector)
+
     @property
     def conjuncts(self):
         """Tokens that are conjoined to the span's root.