diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 0a1bb3c69..7c21ee086 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -238,7 +238,16 @@ cdef class Matcher: matches.append((label, start, end)) else: partials.push_back(state + 1) - doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches + seen = set() + filtered = [] + for label, start, end in sorted(matches, key=lambda m: (m[1], -(m[1] - m[2]))): + if all(i in seen for i in range(start, end)): + continue + else: + for i in range(start, end): + seen.add(i) + filtered.append((label, start, end)) + doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + filtered return matches diff --git a/tests/matcher/test_matcher_bugfixes.py b/tests/matcher/test_matcher_bugfixes.py index b65541460..cf6944512 100644 --- a/tests/matcher/test_matcher_bugfixes.py +++ b/tests/matcher/test_matcher_bugfixes.py @@ -1,17 +1,102 @@ import pytest - from spacy.matcher import Matcher +from spacy.attrs import LOWER + -@pytest.mark.xfail def test_overlap_issue118(EN): '''Test a bug that arose from having overlapping matches''' doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') ORG = doc.vocab.strings['ORG'] - matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, [[{'lower': 'boston'}, {'lower': 'celtics'}], [{'lower': 'celtics'}]])}) + matcher = Matcher(EN.vocab, + {'BostonCeltics': + ('ORG', {}, + [ + [{LOWER: 'celtics'}], + [{LOWER: 'boston'}, {LOWER: 'celtics'}], + ] + ) + } + ) + assert len(list(doc.ents)) == 0 matches = matcher(doc) - assert matches == [(ORG, 9, 11)] + assert matches == [(ORG, 9, 11), (ORG, 10, 11)] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +def test_overlap_reorder(EN): + '''Test order dependence''' + doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') + ORG = doc.vocab.strings['ORG'] + matcher = Matcher(EN.vocab, + {'BostonCeltics': + ('ORG', {}, + [ + [{LOWER: 'boston'}, {LOWER: 'celtics'}], + [{LOWER: 'celtics'}], + ] + ) + } + ) + + assert len(list(doc.ents)) == 0 + matches = matcher(doc) + assert matches == [(ORG, 9, 11), (ORG, 10, 11)] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +def test_overlap_prefix(EN): + '''Test order dependence''' + doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') + ORG = doc.vocab.strings['ORG'] + matcher = Matcher(EN.vocab, + {'BostonCeltics': + ('ORG', {}, + [ + [{LOWER: 'boston'}], + [{LOWER: 'boston'}, {LOWER: 'celtics'}], + ] + ) + } + ) + + assert len(list(doc.ents)) == 0 + matches = matcher(doc) + assert matches == [(ORG, 9, 10), (ORG, 9, 11)] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +def test_overlap_prefix_reorder(EN): + '''Test order dependence''' + doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') + ORG = doc.vocab.strings['ORG'] + matcher = Matcher(EN.vocab, + {'BostonCeltics': + ('ORG', {}, + [ + [{LOWER: 'boston'}, {LOWER: 'celtics'}], + [{LOWER: 'boston'}], + ] + ) + } + ) + + assert len(list(doc.ents)) == 0 + matches = matcher(doc) + assert matches == [(ORG, 9, 10), (ORG, 9, 11)] ents = list(doc.ents) assert len(ents) == 1 assert ents[0].label == ORG