From 1707e77c5e146cebbc553496a1e8894a3facba00 Mon Sep 17 00:00:00 2001
From: tamuhey
Date: Fri, 13 Dec 2019 23:54:58 +0900
Subject: [PATCH] add char_span to Span (#4793)

---
 spacy/tests/doc/test_span.py | 18 ++++++++++++++++++
 spacy/tokens/span.pyx        | 16 ++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index f813a9743..01bb93c50 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
     return doc
 
 
+@pytest.mark.parametrize(
+    "i_sent,i,j,text",
+    [
+        (0, 0, len("This is a"), "This is a"),
+        (1, 0, len("This is another"), "This is another"),
+        (2, len("And "), len("And ") + len("a third"), "a third"),
+        (0, 1, 2, None),
+    ],
+)
+def test_char_span(doc, i_sent, i, j, text):
+    sents = list(doc.sents)
+    span = sents[i_sent].char_span(i, j)
+    if not text:
+        assert not span
+    else:
+        assert span.text == text
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9e99392a9..957e853ca 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -584,6 +584,22 @@ cdef class Span:
         else:
             return self.doc[root]
 
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
+        """Create a `Span` object from the slice `span.text[start_idx : end_idx]`.
+
+        start_idx (int): The index of the first character of the span.
+        end_idx (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        start_idx += self.start_char  # span-relative -> doc-relative offset
+        end_idx += self.start_char
+        return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+
     @property
     def conjuncts(self):
         """Tokens that are conjoined to the span's root.