Fix get_matching_ents (#10451)

* Fix get_matching_ents

Not sure how this happened, but the code prior to this commit simply does
not work: it iterated over the docs themselves rather than over their
`.ents`. The function is already covered by the entity linker tests, which
were succeeding in the NEL PR but couldn't possibly succeed on master.

* Fix test

The test was indented inside another test, so it doesn't seem to have been
running properly.
This commit is contained in:
Paul O'Leary McCann 2022-03-08 00:56:57 +09:00 committed by GitHub
parent 7ed7908716
commit 61ba5450ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 56 deletions

View File

@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)
@pytest.mark.parametrize(
    "patterns",
    [
        # perfect case
        [{"label": "CHARACTER", "pattern": "Kirby"}],
        # typo for false negative
        [{"label": "PERSON", "pattern": "Korby"}],
        # random stuff for false positive
        [{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
    ],
)
def test_no_gold_ents(patterns):
    """Train and evaluate an entity linker configured with use_gold_ents=False.

    The predicted docs get their entities from an entity ruler built from
    `patterns`, standing in for an annotating component. The test passes as
    long as training and evaluation run without raising.
    """
    # test that annotating components work
    TRAIN_DATA = [
        (
            "Kirby is pink",
            {
                "links": {(0, 5): {"Q613241": 1.0}},
                "entities": [(0, 5, "CHARACTER")],
                "sent_starts": [1, 0, 0],
            },
        )
    ]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    # Create a ruler to mark entities
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Apply ruler to examples. In a real pipeline this would be an annotating component.
    for eg in train_examples:
        eg.predicted = ruler(eg.predicted)

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        # Placeholder
        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
        mykb.add_alias("pink", ["pink"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe(
        "entity_linker", config={"use_gold_ents": False}, last=True
    )
    entity_linker.set_kb(create_kb)
    assert entity_linker.use_gold_ents is False

    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # this will run the pipeline on the examples and shouldn't crash
    nlp.evaluate(train_examples)

View File

@ -263,11 +263,11 @@ cdef class Example:
kept. Otherwise only the character indices need to match. kept. Otherwise only the character indices need to match.
""" """
gold = {} gold = {}
for ent in self.reference: for ent in self.reference.ents:
gold[(ent.start_char, ent.end_char)] = ent.label gold[(ent.start_char, ent.end_char)] = ent.label
keep = [] keep = []
for ent in self.predicted: for ent in self.predicted.ents:
key = (ent.start_char, ent.end_char) key = (ent.start_char, ent.end_char)
if key not in gold: if key not in gold:
continue continue