Fix get_matching_ents (#10451)

* Fix get_matching_ents

Not sure what happened here - the code prior to this commit simply does
not work. It's already covered by entity linker tests, which were
succeeding in the NEL PR, but couldn't possibly succeed on master.

* Fix test

Test was indented inside another test and so doesn't seem to have been
running properly.
This commit is contained in:
Paul O'Leary McCann 2022-03-08 00:56:57 +09:00 committed by GitHub
parent 7ed7908716
commit 61ba5450ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 56 deletions

View File

@@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
@pytest.mark.parametrize("patterns", [
# perfect case
[{"label": "CHARACTER", "pattern": "Kirby"}],
# typo for false negative
[{"label": "PERSON", "pattern": "Korby"}],
# random stuff for false positive
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
]
)
def test_no_gold_ents(patterns):
# test that annotating components work
TRAIN_DATA = [
(
"Kirby is pink",
{
"links": {(0, 5): {"Q613241": 1.0}},
"entities": [(0, 5, "CHARACTER")],
"sent_starts": [1, 0, 0],
},
)
]
nlp = English()
vector_length = 3
train_examples = []
for text, annotation in TRAIN_DATA:
doc = nlp(text)
train_examples.append(Example.from_dict(doc, annotation))
@pytest.mark.parametrize("patterns", [
# perfect case
[{"label": "CHARACTER", "pattern": "Kirby"}],
# typo for false negative
[{"label": "PERSON", "pattern": "Korby"}],
# random stuff for false positive
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
]
)
def test_no_gold_ents(patterns):
# test that annotating components work
TRAIN_DATA = [
(
"Kirby is pink",
{
"links": {(0, 5): {"Q613241": 1.0}},
"entities": [(0, 5, "CHARACTER")],
"sent_starts": [1, 0, 0],
},
)
]
nlp = English()
vector_length = 3
train_examples = []
for text, annotation in TRAIN_DATA:
doc = nlp(text)
train_examples.append(Example.from_dict(doc, annotation))
# Create a ruler to mark entities
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Create a ruler to mark entities
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
# Apply ruler to examples. In a real pipeline this would be an annotating component.
for eg in train_examples:
eg.predicted = ruler(eg.predicted)
# Apply ruler to examples. In a real pipeline this would be an annotating component.
for eg in train_examples:
eg.predicted = ruler(eg.predicted)
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
# Placeholder
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
mykb.add_alias("pink", ["pink"], [0.9])
return mykb
def create_kb(vocab):
# create artificial KB
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
# Placeholder
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
mykb.add_alias("pink", ["pink"], [0.9])
return mykb
# Create and train the Entity Linker
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
entity_linker.set_kb(create_kb)
assert entity_linker.use_gold_ents == False
# Create and train the Entity Linker
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
entity_linker.set_kb(create_kb)
assert entity_linker.use_gold_ents == False
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(2):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(2):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples)
# this will run the pipeline on the examples and shouldn't crash
results = nlp.evaluate(train_examples)

View File

@@ -263,11 +263,11 @@ cdef class Example:
kept. Otherwise only the character indices need to match.
"""
gold = {}
for ent in self.reference:
for ent in self.reference.ents:
gold[(ent.start_char, ent.end_char)] = ent.label
keep = []
for ent in self.predicted:
for ent in self.predicted.ents:
key = (ent.start_char, ent.end_char)
if key not in gold:
continue