* Fix Issue #118: Matcher behaves unpredictably when matches overlap.

This commit is contained in:
Matthew Honnibal 2015-10-19 16:45:12 +11:00
parent 135062d23c
commit 6727a46bb5
2 changed files with 99 additions and 5 deletions

View File

@ -238,7 +238,16 @@ cdef class Matcher:
matches.append((label, start, end)) matches.append((label, start, end))
else: else:
partials.push_back(state + 1) partials.push_back(state + 1)
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches seen = set()
filtered = []
for label, start, end in sorted(matches, key=lambda m: (m[1], -(m[1] - m[2]))):
if all(i in seen for i in range(start, end)):
continue
else:
for i in range(start, end):
seen.add(i)
filtered.append((label, start, end))
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + filtered
return matches return matches

View File

@ -1,17 +1,102 @@
import pytest import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.attrs import LOWER
@pytest.mark.xfail
def test_overlap_issue118(EN): def test_overlap_issue118(EN):
'''Test a bug that arose from having overlapping matches''' '''Test a bug that arose from having overlapping matches'''
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
ORG = doc.vocab.strings['ORG'] ORG = doc.vocab.strings['ORG']
matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, [[{'lower': 'boston'}, {'lower': 'celtics'}], [{'lower': 'celtics'}]])}) matcher = Matcher(EN.vocab,
{'BostonCeltics':
('ORG', {},
[
[{LOWER: 'celtics'}],
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
]
)
}
)
assert len(list(doc.ents)) == 0
matches = matcher(doc) matches = matcher(doc)
assert matches == [(ORG, 9, 11)] assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
ents = list(doc.ents)
assert len(ents) == 1
assert ents[0].label == ORG
assert ents[0].start == 9
assert ents[0].end == 11
def test_overlap_reorder(EN):
    '''Overlapping patterns, with the longer pattern registered first.

    The two-token pattern ('boston', 'celtics') is listed before the
    one-token pattern ('celtics'). The matcher is expected to report both
    spans, while doc.ents is expected to hold only the span 9-11.
    '''
    text = u'how many points did lebron james score against the boston celtics last night'
    doc = EN.tokenizer(text)
    ORG = doc.vocab.strings['ORG']

    # Longer pattern first, then the pattern matching only its last token.
    two_token_pattern = [{LOWER: 'boston'}, {LOWER: 'celtics'}]
    one_token_pattern = [{LOWER: 'celtics'}]
    specs = {
        'BostonCeltics': ('ORG', {}, [two_token_pattern, one_token_pattern]),
    }
    matcher = Matcher(EN.vocab, specs)

    # No entities may be present before the matcher has run.
    assert len(list(doc.ents)) == 0

    matches = matcher(doc)
    assert matches == [(ORG, 9, 11), (ORG, 10, 11)]

    # Both spans are returned, but only one entity ends up on the doc.
    ents = list(doc.ents)
    assert len(ents) == 1
    entity = ents[0]
    assert entity.label == ORG
    assert entity.start == 9
    assert entity.end == 11
def test_overlap_prefix(EN):
    '''Overlapping patterns where one pattern is a prefix of the other.

    The one-token pattern ('boston') is listed before the two-token pattern
    ('boston', 'celtics'). The matcher is expected to report both spans,
    while doc.ents is expected to hold only the span 9-11.
    '''
    text = u'how many points did lebron james score against the boston celtics last night'
    doc = EN.tokenizer(text)
    ORG = doc.vocab.strings['ORG']

    # Prefix pattern first, then the longer pattern that extends it.
    prefix_pattern = [{LOWER: 'boston'}]
    extended_pattern = [{LOWER: 'boston'}, {LOWER: 'celtics'}]
    specs = {
        'BostonCeltics': ('ORG', {}, [prefix_pattern, extended_pattern]),
    }
    matcher = Matcher(EN.vocab, specs)

    # No entities may be present before the matcher has run.
    assert len(list(doc.ents)) == 0

    matches = matcher(doc)
    assert matches == [(ORG, 9, 10), (ORG, 9, 11)]

    # Both spans are returned, but only one entity ends up on the doc.
    ents = list(doc.ents)
    assert len(ents) == 1
    entity = ents[0]
    assert entity.label == ORG
    assert entity.start == 9
    assert entity.end == 11
def test_overlap_prefix_reorder(EN):
'''Test order dependence'''
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
ORG = doc.vocab.strings['ORG']
matcher = Matcher(EN.vocab,
{'BostonCeltics':
('ORG', {},
[
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
[{LOWER: 'boston'}],
]
)
}
)
assert len(list(doc.ents)) == 0
matches = matcher(doc)
assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
ents = list(doc.ents) ents = list(doc.ents)
assert len(ents) == 1 assert len(ents) == 1
assert ents[0].label == ORG assert ents[0].label == ORG