mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Fix get_matching_ents (#10451)
* Fix get_matching_ents

  Not sure what happened here - the code prior to this commit simply does not work. It's already covered by the entity linker tests, which were succeeding in the NEL PR, but couldn't possibly succeed on master.

* Fix test

  The test was indented inside another test, so it doesn't seem to have been running properly.
This commit is contained in:
parent
7ed7908716
commit
61ba5450ff
|
@@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
|
|||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
||||
@pytest.mark.parametrize("patterns", [
|
||||
# perfect case
|
||||
[{"label": "CHARACTER", "pattern": "Kirby"}],
|
||||
# typo for false negative
|
||||
[{"label": "PERSON", "pattern": "Korby"}],
|
||||
# random stuff for false positive
|
||||
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
|
||||
]
|
||||
)
|
||||
def test_no_gold_ents(patterns):
|
||||
# test that annotating components work
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"Kirby is pink",
|
||||
{
|
||||
"links": {(0, 5): {"Q613241": 1.0}},
|
||||
"entities": [(0, 5, "CHARACTER")],
|
||||
"sent_starts": [1, 0, 0],
|
||||
},
|
||||
)
|
||||
]
|
||||
nlp = English()
|
||||
vector_length = 3
|
||||
train_examples = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
doc = nlp(text)
|
||||
train_examples.append(Example.from_dict(doc, annotation))
|
||||
@pytest.mark.parametrize("patterns", [
|
||||
# perfect case
|
||||
[{"label": "CHARACTER", "pattern": "Kirby"}],
|
||||
# typo for false negative
|
||||
[{"label": "PERSON", "pattern": "Korby"}],
|
||||
# random stuff for false positive
|
||||
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
|
||||
]
|
||||
)
|
||||
def test_no_gold_ents(patterns):
|
||||
# test that annotating components work
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"Kirby is pink",
|
||||
{
|
||||
"links": {(0, 5): {"Q613241": 1.0}},
|
||||
"entities": [(0, 5, "CHARACTER")],
|
||||
"sent_starts": [1, 0, 0],
|
||||
},
|
||||
)
|
||||
]
|
||||
nlp = English()
|
||||
vector_length = 3
|
||||
train_examples = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
doc = nlp(text)
|
||||
train_examples.append(Example.from_dict(doc, annotation))
|
||||
|
||||
# Create a ruler to mark entities
|
||||
ruler = nlp.add_pipe("entity_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
# Create a ruler to mark entities
|
||||
ruler = nlp.add_pipe("entity_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
|
||||
# Apply ruler to examples. In a real pipeline this would be an annotating component.
|
||||
for eg in train_examples:
|
||||
eg.predicted = ruler(eg.predicted)
|
||||
# Apply ruler to examples. In a real pipeline this would be an annotating component.
|
||||
for eg in train_examples:
|
||||
eg.predicted = ruler(eg.predicted)
|
||||
|
||||
def create_kb(vocab):
|
||||
# create artificial KB
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
||||
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
||||
# Placeholder
|
||||
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
|
||||
mykb.add_alias("pink", ["pink"], [0.9])
|
||||
return mykb
|
||||
def create_kb(vocab):
|
||||
# create artificial KB
|
||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
||||
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
||||
# Placeholder
|
||||
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
|
||||
mykb.add_alias("pink", ["pink"], [0.9])
|
||||
return mykb
|
||||
|
||||
|
||||
# Create and train the Entity Linker
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
|
||||
entity_linker.set_kb(create_kb)
|
||||
assert entity_linker.use_gold_ents == False
|
||||
# Create and train the Entity Linker
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
|
||||
entity_linker.set_kb(create_kb)
|
||||
assert entity_linker.use_gold_ents == False
|
||||
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
|
||||
# adding additional components that are required for the entity_linker
|
||||
nlp.add_pipe("sentencizer", first=True)
|
||||
# adding additional components that are required for the entity_linker
|
||||
nlp.add_pipe("sentencizer", first=True)
|
||||
|
||||
# this will run the pipeline on the examples and shouldn't crash
|
||||
results = nlp.evaluate(train_examples)
|
||||
# this will run the pipeline on the examples and shouldn't crash
|
||||
results = nlp.evaluate(train_examples)
|
||||
|
|
|
@@ -263,11 +263,11 @@ cdef class Example:
|
|||
kept. Otherwise only the character indices need to match.
|
||||
"""
|
||||
gold = {}
|
||||
for ent in self.reference:
|
||||
for ent in self.reference.ents:
|
||||
gold[(ent.start_char, ent.end_char)] = ent.label
|
||||
|
||||
keep = []
|
||||
for ent in self.predicted:
|
||||
for ent in self.predicted.ents:
|
||||
key = (ent.start_char, ent.end_char)
|
||||
if key not in gold:
|
||||
continue
|
||||
|
|
Loading…
Reference in New Issue
Block a user