Fix span offsets for Matcher(as_spans) on spans (#7992)

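Fix the offsets of the spans returned by `matcher(span, as_spans=True)`. When the matcher is called on a `Span`, the match offsets are relative to that span, so they are now shifted by `span.start` before the returned `Span` objects are created on the parent `Doc`.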
This commit is contained in:
Adriane Boyd 2021-05-06 10:42:44 +02:00 committed by GitHub
parent 7d5db41ac3
commit 0a22fed634
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 1 deletion

View File

@@ -284,7 +284,13 @@ cdef class Matcher:
if on_match is not None:
on_match(self, doc, i, final_matches)
if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in final_matches]
spans = []
for key, start, end in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
spans.append(Span(doc, start, end, label=key))
return spans
elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int]
final_matches = []

View File

@@ -513,6 +513,12 @@ def test_matcher_as_spans(matcher):
assert matches[1].text == "Java"
assert matches[1].label_ == "Java"
matches = matcher(doc[1:], as_spans=True)
assert len(matches) == 1
assert isinstance(matches[0], Span)
assert matches[0].text == "Java"
assert matches[0].label_ == "Java"
def test_matcher_deprecated(matcher):
doc = Doc(matcher.vocab, words=["hello", "world"])