mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix get_matching_ents (#10451)
* Fix get_matching_ents

  Not sure what happened here - the code prior to this commit simply does not work. It's already covered by entity linker tests, which were succeeding in the NEL PR, but couldn't possibly succeed on master.

* Fix test

  Test was indented inside another test and so doesn't seem to have been running properly.
This commit is contained in:
parent
7ed7908716
commit
61ba5450ff
|
@ -1009,65 +1009,65 @@ def test_legacy_architectures(name, config):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
@pytest.mark.parametrize("patterns", [
|
@pytest.mark.parametrize("patterns", [
|
||||||
# perfect case
|
# perfect case
|
||||||
[{"label": "CHARACTER", "pattern": "Kirby"}],
|
[{"label": "CHARACTER", "pattern": "Kirby"}],
|
||||||
# typo for false negative
|
# typo for false negative
|
||||||
[{"label": "PERSON", "pattern": "Korby"}],
|
[{"label": "PERSON", "pattern": "Korby"}],
|
||||||
# random stuff for false positive
|
# random stuff for false positive
|
||||||
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
|
[{"label": "IS", "pattern": "is"}, {"label": "COLOR", "pattern": "pink"}],
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_no_gold_ents(patterns):
|
def test_no_gold_ents(patterns):
|
||||||
# test that annotating components work
|
# test that annotating components work
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
(
|
(
|
||||||
"Kirby is pink",
|
"Kirby is pink",
|
||||||
{
|
{
|
||||||
"links": {(0, 5): {"Q613241": 1.0}},
|
"links": {(0, 5): {"Q613241": 1.0}},
|
||||||
"entities": [(0, 5, "CHARACTER")],
|
"entities": [(0, 5, "CHARACTER")],
|
||||||
"sent_starts": [1, 0, 0],
|
"sent_starts": [1, 0, 0],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
nlp = English()
|
nlp = English()
|
||||||
vector_length = 3
|
vector_length = 3
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotation in TRAIN_DATA:
|
for text, annotation in TRAIN_DATA:
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
train_examples.append(Example.from_dict(doc, annotation))
|
train_examples.append(Example.from_dict(doc, annotation))
|
||||||
|
|
||||||
# Create a ruler to mark entities
|
# Create a ruler to mark entities
|
||||||
ruler = nlp.add_pipe("entity_ruler")
|
ruler = nlp.add_pipe("entity_ruler")
|
||||||
ruler.add_patterns(patterns)
|
ruler.add_patterns(patterns)
|
||||||
|
|
||||||
# Apply ruler to examples. In a real pipeline this would be an annotating component.
|
# Apply ruler to examples. In a real pipeline this would be an annotating component.
|
||||||
for eg in train_examples:
|
for eg in train_examples:
|
||||||
eg.predicted = ruler(eg.predicted)
|
eg.predicted = ruler(eg.predicted)
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
# create artificial KB
|
# create artificial KB
|
||||||
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
|
||||||
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
|
||||||
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
mykb.add_alias("Kirby", ["Q613241"], [0.9])
|
||||||
# Placeholder
|
# Placeholder
|
||||||
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
|
mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
|
||||||
mykb.add_alias("pink", ["pink"], [0.9])
|
mykb.add_alias("pink", ["pink"], [0.9])
|
||||||
return mykb
|
return mykb
|
||||||
|
|
||||||
|
|
||||||
# Create and train the Entity Linker
|
# Create and train the Entity Linker
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
|
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": False}, last=True)
|
||||||
entity_linker.set_kb(create_kb)
|
entity_linker.set_kb(create_kb)
|
||||||
assert entity_linker.use_gold_ents == False
|
assert entity_linker.use_gold_ents == False
|
||||||
|
|
||||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
for i in range(2):
|
for i in range(2):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
# adding additional components that are required for the entity_linker
|
# adding additional components that are required for the entity_linker
|
||||||
nlp.add_pipe("sentencizer", first=True)
|
nlp.add_pipe("sentencizer", first=True)
|
||||||
|
|
||||||
# this will run the pipeline on the examples and shouldn't crash
|
# this will run the pipeline on the examples and shouldn't crash
|
||||||
results = nlp.evaluate(train_examples)
|
results = nlp.evaluate(train_examples)
|
||||||
|
|
|
@ -263,11 +263,11 @@ cdef class Example:
|
||||||
kept. Otherwise only the character indices need to match.
|
kept. Otherwise only the character indices need to match.
|
||||||
"""
|
"""
|
||||||
gold = {}
|
gold = {}
|
||||||
for ent in self.reference:
|
for ent in self.reference.ents:
|
||||||
gold[(ent.start_char, ent.end_char)] = ent.label
|
gold[(ent.start_char, ent.end_char)] = ent.label
|
||||||
|
|
||||||
keep = []
|
keep = []
|
||||||
for ent in self.predicted:
|
for ent in self.predicted.ents:
|
||||||
key = (ent.start_char, ent.end_char)
|
key = (ent.start_char, ent.end_char)
|
||||||
if key not in gold:
|
if key not in gold:
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in New Issue
Block a user