mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
baseline evaluation using highest-freq candidate
This commit is contained in:
parent
6961215578
commit
7e348d7f7f
|
@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
|
|||
if to_print:
|
||||
print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
|
||||
|
||||
|
||||
def test_kb(kb):
    """Print the KB candidates for "Bush" and the entities linked in a sample text.

    kb: knowledge base that is handed to a freshly created entity linker pipe.
    """
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    nlp = spacy.load('en_core_web_sm')

    # create the entity linker around the given KB and append it to the pipeline
    nlp.add_pipe(nlp.create_pipe(name='entity_linker', config={"kb": kb}), last=True)

    bush_candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for candidate in bush_candidates:
        freq_note = " (freq=" + str(candidate.entity_freq) + ")"
        print(" ", candidate.prior_prob, candidate.alias_, "-->", candidate.entity_ + freq_note)
    print()

    sample_text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Douglas doesn't write about George Washington or Homer Simpson."
    )
    parsed = nlp(sample_text)

    for mention in parsed.ents:
        print("ent", mention.text, mention.label_, mention.kb_id_)
|
||||
|
|
|
@ -1,12 +1,113 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import spacy
|
||||
import datetime
|
||||
from os import listdir
|
||||
|
||||
from examples.pipeline.wiki_entity_linking import training_set_creator
|
||||
|
||||
# requires: pip install neuralcoref --no-binary neuralcoref
|
||||
# import neuralcoref
|
||||
|
||||
|
||||
def run_el_toy_example(nlp, kb):
    """Add the entity linker to `nlp`, then print the KB candidates for "Bush"
    and the entities found in a short sample text.

    nlp: pipeline that receives the entity linker (via `_prepare_pipeline`).
    kb: knowledge base used for candidate generation and by the entity linker.
    """
    _prepare_pipeline(nlp, kb)

    bush_candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for candidate in bush_candidates:
        freq_note = " (freq=" + str(candidate.entity_freq) + ")"
        print(" ", candidate.prior_prob, candidate.alias_, "-->", candidate.entity_ + freq_note)
    print()

    sample_text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Douglas doesn't write about George Washington or Homer Simpson."
    )
    parsed = nlp(sample_text)

    for mention in parsed.ents:
        print("ent", mention.text, mention.label_, mention.kb_id_)
|
||||
|
||||
|
||||
def run_el_training(nlp, kb, training_dir, limit=None):
    """Run the entity linker over the dev articles in `training_dir` and evaluate it.

    nlp: pipeline that receives the entity linker (via `_prepare_pipeline`).
    kb: knowledge base handed to the entity linker.
    training_dir: directory holding the article text files and the gold entity file.
    limit: maximum number of dev articles to process; None (or 0) means no limit.

    Only PERSON entities with a known gold entity are scored, because the
    training data is not complete.
    """
    _prepare_pipeline(nlp, kb)

    correct_entries_per_article, _ = training_set_creator.read_training_entities(
        training_output=training_dir,
        collect_correct=True,
        collect_incorrect=False)

    predictions = list()
    golds = list()

    cnt = 0
    for f in listdir(training_dir):
        # stop scanning the directory as soon as enough dev articles were processed
        if limit and cnt >= limit:
            break
        if not is_dev(f):
            continue

        article_id = f.replace(".txt", "")
        if cnt % 500 == 0:
            print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
        cnt += 1

        with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
            text = file.read()

        # an article without any correct gold entry has no key in the dict;
        # treat it as "nothing to score" instead of failing with a KeyError
        gold_entries = correct_entries_per_article.get(article_id, dict())

        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ == "PERSON":  # TODO: expand to other types
                gold_entity = gold_entries.get(ent.text, None)
                # only evaluating gold entities we know, because the training data is not complete
                if gold_entity:
                    predictions.append(ent.kb_id_)
                    golds.append(gold_entity)

    print("Processed", cnt, "dev articles")
    print()
    evaluate(predictions, golds)
|
||||
|
||||
|
||||
def is_dev(file_name):
    """Return True when `file_name` ends in "3.txt", which marks a dev-set article."""
    dev_suffix = "3.txt"
    return file_name.endswith(dev_suffix)
|
||||
|
||||
|
||||
def evaluate(predictions, golds):
    """Print and return precision, recall and F-score of the predicted KB IDs.

    predictions: predicted KB IDs, aligned with `golds`; a falsy value
        (such as "") counts as "no prediction" and therefore as a false negative.
    golds: gold KB IDs.

    RETURNS: tuple (precision, recall, fscore), each a float between 0 and 1.
    RAISES: ValueError if the two sequences differ in length.
    """
    if len(predictions) != len(golds):
        raise ValueError("predictions and gold entities should have the same length")

    print("Evaluating", len(golds), "entities")

    tp = 0
    fp = 0
    fn = 0

    for pred, gold in zip(predictions, golds):
        if not pred:
            fn += 1
        elif pred == gold:
            tp += 1
        else:
            fp += 1

    print("tp", tp)
    print("fp", fp)
    print("fn", fn)

    # the tiny epsilon keeps every division defined (and float-valued) when a denominator is zero
    eps = 0.0000001
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    fscore = 2 * recall * precision / (recall + precision + eps)

    print("precision", round(100 * precision, 1), "%")
    print("recall", round(100 * recall, 1), "%")
    print("Fscore", round(100 * fscore, 1), "%")

    # returned so that callers can use the metrics instead of parsing the printed output
    return precision, recall, fscore
|
||||
|
||||
|
||||
def _prepare_pipeline(nlp, kb):
    """Create an 'entity_linker' pipe configured with `kb` and append it as the last component of `nlp`."""
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    linker_config = {"kb": kb}
    nlp.add_pipe(nlp.create_pipe(name='entity_linker', config=linker_config), last=True)
|
||||
|
||||
|
||||
# TODO
|
||||
def add_coref():
    """Placeholder for adding coreference resolution to our model (not implemented yet)."""
    return None
|
||||
|
|
|
@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
|
|||
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
|
||||
"""
|
||||
|
||||
ENTITY_FILE = "gold_entities.csv"
|
||||
|
||||
def create_training(kb, entity_input, training_output):
|
||||
if not kb:
|
||||
|
@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
|
|||
|
||||
read_ids = set()
|
||||
|
||||
entityfile_loc = training_output + "/" + "gold_entities.csv"
|
||||
entityfile_loc = training_output + "/" + ENTITY_FILE
|
||||
with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
|
||||
# write entity training header file
|
||||
_write_training_entity(outputfile=entityfile,
|
||||
|
@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):
|
|||
|
||||
def _write_training_entity(outputfile, article_id, alias, entity, correct):
|
||||
outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
|
||||
|
||||
|
||||
def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
    """Read the gold entity file stored in `training_output`.

    Every line is split on "|" into `article_id`, `alias`, `entity` and `correct`.
    Lines whose `correct` field is neither "1" nor "0" are ignored.

    training_output: directory that contains ENTITY_FILE.
    collect_correct: whether to collect the entries marked "1".
    collect_incorrect: whether to collect the entries marked "0".

    RETURNS: tuple (correct_entries_per_article, incorrect_entries_per_article).
        The first maps article_id -> {alias: entity}, the second maps
        article_id -> {alias: set of entities}. A dict stays empty when its
        collect flag is False.
    RAISES: ValueError when an alias is marked correct more than once within one article.
    """
    entityfile_loc = training_output + "/" + ENTITY_FILE
    incorrect_entries_per_article = dict()
    correct_entries_per_article = dict()

    with open(entityfile_loc, mode='r', encoding='utf8') as file:
        for line in file:
            fields = line.replace('\n', "").split(sep='|')
            article_id = fields[0]
            alias = fields[1]
            entity = fields[2]
            correct = fields[3]

            if correct == "1" and collect_correct:
                entry_dict = correct_entries_per_article.setdefault(article_id, dict())
                if alias in entry_dict:
                    # build one readable message instead of passing the pieces as separate args
                    raise ValueError("Found alias {} multiple times for article {} in {}"
                                     .format(alias, article_id, ENTITY_FILE))
                entry_dict[alias] = entity

            if correct == "0" and collect_incorrect:
                entry_dict = incorrect_entries_per_article.setdefault(article_id, dict())
                entry_dict.setdefault(alias, set()).add(entity)

    return correct_entries_per_article, incorrect_entries_per_article
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from . import wikipedia_processor as wp, kb_creator, training_set_creator
|
||||
from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
|
||||
|
||||
import spacy
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
|
|||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
|
||||
|
||||
TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
|
||||
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
|
||||
TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -37,8 +36,12 @@ if __name__ == "__main__":
|
|||
to_read_kb = True
|
||||
to_test_kb = False
|
||||
|
||||
# create training dataset
|
||||
create_wp_training = False
|
||||
|
||||
# apply named entity linking to the training dataset
|
||||
apply_to_training = True
|
||||
|
||||
# STEP 1 : create prior probabilities from WP
|
||||
# run only once !
|
||||
if to_create_prior_probs:
|
||||
|
@ -88,13 +91,21 @@ if __name__ == "__main__":
|
|||
|
||||
# test KB
|
||||
if to_test_kb:
|
||||
kb_creator.test_kb(my_kb)
|
||||
my_nlp = spacy.load('en_core_web_sm')
|
||||
run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
|
||||
print()
|
||||
|
||||
# STEP 5: create a training dataset from WP
|
||||
if create_wp_training:
|
||||
print("STEP 5: create training dataset", datetime.datetime.now())
|
||||
training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
|
||||
training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)
|
||||
|
||||
# STEP 6: apply the EL algorithm on the training dataset
|
||||
if apply_to_training:
|
||||
my_nlp = spacy.load('en_core_web_sm')
|
||||
run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
|
||||
print()
|
||||
|
||||
|
||||
# TODO coreference resolution
|
||||
# add_coref()
|
||||
|
|
Loading…
Reference in New Issue
Block a user