mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
* Fix Issue #118: Matcher behaves unpredictably when matches overlap.
This commit is contained in:
parent
135062d23c
commit
6727a46bb5
|
@ -238,7 +238,16 @@ cdef class Matcher:
|
||||||
matches.append((label, start, end))
|
matches.append((label, start, end))
|
||||||
else:
|
else:
|
||||||
partials.push_back(state + 1)
|
partials.push_back(state + 1)
|
||||||
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
|
seen = set()
|
||||||
|
filtered = []
|
||||||
|
for label, start, end in sorted(matches, key=lambda m: (m[1], -(m[1] - m[2]))):
|
||||||
|
if all(i in seen for i in range(start, end)):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
for i in range(start, end):
|
||||||
|
seen.add(i)
|
||||||
|
filtered.append((label, start, end))
|
||||||
|
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + filtered
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,102 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.attrs import LOWER
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_overlap_issue118(EN):
|
def test_overlap_issue118(EN):
|
||||||
'''Test a bug that arose from having overlapping matches'''
|
'''Test a bug that arose from having overlapping matches'''
|
||||||
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
||||||
ORG = doc.vocab.strings['ORG']
|
ORG = doc.vocab.strings['ORG']
|
||||||
matcher = Matcher(EN.vocab, {'BostonCeltics': ('ORG', {}, [[{'lower': 'boston'}, {'lower': 'celtics'}], [{'lower': 'celtics'}]])})
|
matcher = Matcher(EN.vocab,
|
||||||
|
{'BostonCeltics':
|
||||||
|
('ORG', {},
|
||||||
|
[
|
||||||
|
[{LOWER: 'celtics'}],
|
||||||
|
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches == [(ORG, 9, 11)]
|
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
|
||||||
|
ents = list(doc.ents)
|
||||||
|
assert len(ents) == 1
|
||||||
|
assert ents[0].label == ORG
|
||||||
|
assert ents[0].start == 9
|
||||||
|
assert ents[0].end == 11
|
||||||
|
|
||||||
|
|
||||||
|
def test_overlap_reorder(EN):
|
||||||
|
'''Test order dependence'''
|
||||||
|
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
||||||
|
ORG = doc.vocab.strings['ORG']
|
||||||
|
matcher = Matcher(EN.vocab,
|
||||||
|
{'BostonCeltics':
|
||||||
|
('ORG', {},
|
||||||
|
[
|
||||||
|
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
|
||||||
|
[{LOWER: 'celtics'}],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
|
||||||
|
ents = list(doc.ents)
|
||||||
|
assert len(ents) == 1
|
||||||
|
assert ents[0].label == ORG
|
||||||
|
assert ents[0].start == 9
|
||||||
|
assert ents[0].end == 11
|
||||||
|
|
||||||
|
|
||||||
|
def test_overlap_prefix(EN):
|
||||||
|
'''Test order dependence'''
|
||||||
|
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
||||||
|
ORG = doc.vocab.strings['ORG']
|
||||||
|
matcher = Matcher(EN.vocab,
|
||||||
|
{'BostonCeltics':
|
||||||
|
('ORG', {},
|
||||||
|
[
|
||||||
|
[{LOWER: 'boston'}],
|
||||||
|
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
|
||||||
|
ents = list(doc.ents)
|
||||||
|
assert len(ents) == 1
|
||||||
|
assert ents[0].label == ORG
|
||||||
|
assert ents[0].start == 9
|
||||||
|
assert ents[0].end == 11
|
||||||
|
|
||||||
|
|
||||||
|
def test_overlap_prefix_reorder(EN):
|
||||||
|
'''Test order dependence'''
|
||||||
|
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
||||||
|
ORG = doc.vocab.strings['ORG']
|
||||||
|
matcher = Matcher(EN.vocab,
|
||||||
|
{'BostonCeltics':
|
||||||
|
('ORG', {},
|
||||||
|
[
|
||||||
|
[{LOWER: 'boston'}, {LOWER: 'celtics'}],
|
||||||
|
[{LOWER: 'boston'}],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(list(doc.ents)) == 0
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert matches == [(ORG, 9, 10), (ORG, 9, 11)]
|
||||||
ents = list(doc.ents)
|
ents = list(doc.ents)
|
||||||
assert len(ents) == 1
|
assert len(ents) == 1
|
||||||
assert ents[0].label == ORG
|
assert ents[0].label == ORG
|
||||||
|
|
Loading…
Reference in New Issue
Block a user