Return doc offsets in Matcher on spans (#10576)

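Previously, when the `Matcher` was called on a `Span`, the returned match
offsets were converted to doc offsets only for `as_spans=True`; in all
other cases they stayed relative to the span. Because the `on_match`
callbacks are always applied to the doc, `Matcher` matches on spans
should consistently use doc offsets.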
This commit is contained in:
Adriane Boyd 2022-04-15 15:34:58 +02:00 committed by GitHub
parent 75f7c15187
commit 0e71bd973f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 6 deletions

View File

@@ -252,6 +252,10 @@ cdef class Matcher:
# non-overlapping ones this `match` can be either (start, end) or
# (start, end, alignments) depending on `with_alignments=` option.
for key, *match in matches:
# Adjust span matches to doc offsets
if isinstance(doclike, Span):
match[0] += doclike.start
match[1] += doclike.start
span_filter = self._filter.get(key)
if span_filter is not None:
pairs = pairs_by_id.get(key, [])
@@ -282,9 +286,6 @@ cdef class Matcher:
if as_spans:
final_results = []
for key, start, end, *_ in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
final_results.append(Span(doc, start, end, label=key))
elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int]

View File

@@ -591,9 +591,16 @@ def test_matcher_span(matcher):
doc = Doc(matcher.vocab, words=text.split())
span_js = doc[:3]
span_java = doc[4:]
assert len(matcher(doc)) == 2
assert len(matcher(span_js)) == 1
assert len(matcher(span_java)) == 1
doc_matches = matcher(doc)
span_js_matches = matcher(span_js)
span_java_matches = matcher(span_java)
assert len(doc_matches) == 2
assert len(span_js_matches) == 1
assert len(span_java_matches) == 1
# match offsets always refer to the doc
assert doc_matches[0] == span_js_matches[0]
assert doc_matches[1] == span_java_matches[0]
def test_matcher_as_spans(matcher):