mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
* Fix Issue #118: Matcher behaves unpredictably when matches overlap.
This commit is contained in:
parent
135062d23c
commit
6727a46bb5
|
@ -238,7 +238,16 @@ cdef class Matcher:
|
|||
matches.append((label, start, end))
|
||||
else:
|
||||
partials.push_back(state + 1)
|
||||
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
|
||||
seen = set()
|
||||
filtered = []
|
||||
for label, start, end in sorted(matches, key=lambda m: (m[1], -(m[1] - m[2]))):
|
||||
if all(i in seen for i in range(start, end)):
|
||||
continue
|
||||
else:
|
||||
for i in range(start, end):
|
||||
seen.add(i)
|
||||
filtered.append((label, start, end))
|
||||
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + filtered
|
||||
return matches
|
||||
|
||||
|
||||
|
|
|
@ -1,17 +1,102 @@
|
|||
import pytest
|
||||
|
||||
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER
|
||||
|
||||
|
||||
# NOTE(review): the xfail marker is kept as found; if the Matcher overlap fix
# is in place this test is expected to pass and the marker can be dropped --
# confirm against the Matcher implementation.
@pytest.mark.xfail
def test_overlap_issue118(EN):
    '''Regression test for Issue #118: overlapping matches.

    Two patterns share the token "celtics": the single-token pattern is
    listed first, the two-token "boston celtics" pattern second. The matcher
    is expected to report both spans, while the document ends up with exactly
    one ORG entity covering tokens 9-11.
    '''
    doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
    ORG = doc.vocab.strings['ORG']
    # A single matcher is built; the earlier duplicate construction (string
    # 'lower' keys) was dead code because it was immediately overwritten.
    matcher = Matcher(EN.vocab,
        {'BostonCeltics':
            ('ORG', {},
                [
                    [{LOWER: 'celtics'}],
                    [{LOWER: 'boston'}, {LOWER: 'celtics'}],
                ]
            )
        }
    )

    # No entities are set before the matcher runs.
    assert len(list(doc.ents)) == 0
    matches = matcher(doc)
    # The matcher returns every match, including the overlapping shorter one.
    # The stale single-match assertion contradicted this one and was removed.
    assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
    # Only one entity is expected on the document.
    ents = list(doc.ents)
    assert len(ents) == 1
    assert ents[0].label == ORG
    assert ents[0].start == 9
    assert ents[0].end == 11
|
||||
|
||||
|
||||
def test_overlap_reorder(EN):
    '''Overlapping patterns with the longer pattern declared first.

    The two-token "boston celtics" pattern precedes the single-token
    "celtics" pattern. Both spans must be returned as matches, and the
    document must carry a single ORG entity spanning tokens 9-11.
    '''
    text = u'how many points did lebron james score against the boston celtics last night'
    tokens = EN.tokenizer(text)
    org_label = tokens.vocab.strings['ORG']
    patterns = [
        [{LOWER: 'boston'}, {LOWER: 'celtics'}],
        [{LOWER: 'celtics'}],
    ]
    overlap_matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, patterns)})

    # The freshly tokenized document has no entities yet.
    assert list(tokens.ents) == []

    found = overlap_matcher(tokens)
    assert found == [(org_label, 9, 11), (org_label, 10, 11)]

    # Exactly one entity is expected on the document after matching.
    entities = list(tokens.ents)
    assert len(entities) == 1
    entity = entities[0]
    assert entity.label == org_label
    assert entity.start == 9
    assert entity.end == 11
|
||||
|
||||
|
||||
def test_overlap_prefix(EN):
    '''Overlapping patterns where one is a prefix of the other.

    The single-token "boston" pattern is declared before the two-token
    "boston celtics" pattern. Both spans must be returned as matches, and the
    document must carry a single ORG entity spanning tokens 9-11.
    '''
    text = u'how many points did lebron james score against the boston celtics last night'
    tokens = EN.tokenizer(text)
    org_label = tokens.vocab.strings['ORG']
    patterns = [
        [{LOWER: 'boston'}],
        [{LOWER: 'boston'}, {LOWER: 'celtics'}],
    ]
    prefix_matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, patterns)})

    # The freshly tokenized document has no entities yet.
    assert list(tokens.ents) == []

    found = prefix_matcher(tokens)
    assert found == [(org_label, 9, 10), (org_label, 9, 11)]

    # Exactly one entity is expected on the document after matching.
    entities = list(tokens.ents)
    assert len(entities) == 1
    entity = entities[0]
    assert entity.label == org_label
    assert entity.start == 9
    assert entity.end == 11
|
||||
|
||||
|
||||
def test_overlap_prefix_reorder(EN):
    '''Prefix-overlapping patterns with the longer pattern declared first.

    The two-token "boston celtics" pattern precedes its single-token prefix
    "boston". Both spans must be returned as matches, and the document must
    carry a single entity labelled ORG.
    '''
    text = u'how many points did lebron james score against the boston celtics last night'
    tokens = EN.tokenizer(text)
    org_label = tokens.vocab.strings['ORG']
    patterns = [
        [{LOWER: 'boston'}, {LOWER: 'celtics'}],
        [{LOWER: 'boston'}],
    ]
    prefix_matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, patterns)})

    # The freshly tokenized document has no entities yet.
    assert list(tokens.ents) == []

    found = prefix_matcher(tokens)
    assert found == [(org_label, 9, 10), (org_label, 9, 11)]

    # Only the entity count and label are checked here.
    entities = list(tokens.ents)
    assert len(entities) == 1
    assert entities[0].label == org_label
|
||||
|
|
Loading…
Reference in New Issue
Block a user