Fix/span.sent (#6083)

* add failing test

* fix test

* fix span.sent

* Remove incorrect implicit check

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Yohei Tamura 2020-10-01 21:01:52 +09:00 committed by GitHub
parent 4cbb954281
commit 3243ddac8f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 6 deletions

View File

@ -174,19 +174,25 @@ def test_spans_by_character(doc):
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
span2 = doc.char_span(
span1.start_char, span1.end_char, label="GPE", alignment_mode="strict"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "contract"
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
span2 = doc.char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "expand"
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
span2 = doc.char_span(
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
@ -318,3 +324,11 @@ def test_span_boundaries(doc):
_ = span[-5]
with pytest.raises(IndexError):
_ = span[5]
def test_sent(en_tokenizer):
    """Accessing `span.sent` on a doc without sentence boundaries raises ValueError."""
    text = "Check span.sent raises error if doc is not sentencized."
    tokens = en_tokenizer(text)
    piece = tokens[1:3]
    # Precondition: the doc produced by the tokenizer fixture must not be
    # sentenced, otherwise the ValueError check below would be meaningless.
    assert not piece.doc.is_sentenced
    with pytest.raises(ValueError):
        # Reading the property is what triggers the error; the value is unused.
        _ = piece.sent

View File

@ -391,8 +391,6 @@ cdef class Span:
"""RETURNS (Span): The sentence span that the span is a part of."""
if "sent" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["sent"](self)
# This should raise if not parsed / no custom sentence boundaries
self.doc.sents
# Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0
if self.doc.is_sentenced:
@ -402,13 +400,14 @@ cdef class Span:
start += -1
# Find end of the sentence
end = self.end
n = 0
while end < self.doc.length and self.doc.c[end].sent_start != 1:
end += 1
n += 1
if n >= self.doc.length:
break
return self.doc[start:end]
else:
raise ValueError(Errors.E030)
@property
def ents(self):