From 0e71bd973fa39e9b39f2ff4e023a42cde2064ff7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 15 Apr 2022 15:34:58 +0200 Subject: [PATCH] Return doc offsets in Matcher on spans (#10576) The returned match offsets were only adjusted for `as_spans`, not generally. Because the `on_match` callbacks are always applied to the doc, the `Matcher` matches on spans should consistently use the doc offsets. --- spacy/matcher/matcher.pyx | 7 ++++--- spacy/tests/matcher/test_matcher_api.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..863bd198c 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -252,6 +252,10 @@ cdef class Matcher: # non-overlapping ones this `match` can be either (start, end) or # (start, end, alignments) depending on `with_alignments=` option. for key, *match in matches: + # Adjust span matches to doc offsets + if isinstance(doclike, Span): + match[0] += doclike.start + match[1] += doclike.start span_filter = self._filter.get(key) if span_filter is not None: pairs = pairs_by_id.get(key, []) @@ -282,9 +286,6 @@ cdef class Matcher: if as_spans: final_results = [] for key, start, end, *_ in final_matches: - if isinstance(doclike, Span): - start += doclike.start - end += doclike.start final_results.append(Span(doc, start, end, label=key)) elif with_alignments: # convert alignments List[Dict[str, int]] --> List[int] diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..9d401382f 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -591,9 +591,16 @@ def test_matcher_span(matcher): doc = Doc(matcher.vocab, words=text.split()) span_js = doc[:3] span_java = doc[4:] - assert len(matcher(doc)) == 2 - assert len(matcher(span_js)) == 1 - assert len(matcher(span_java)) == 1 + doc_matches = matcher(doc) + span_js_matches = matcher(span_js) + span_java_matches = matcher(span_java) + assert len(doc_matches) == 2 + assert len(span_js_matches) == 1 + assert len(span_java_matches) == 1 + + # match offsets always refer to the doc + assert doc_matches[0] == span_js_matches[0] + assert doc_matches[1] == span_java_matches[0] def test_matcher_as_spans(matcher):