Mirror of https://github.com/explosion/spaCy.git
baseline evaluation using highest-freq candidate
This commit is contained in:
parent 6961215578
commit 7e348d7f7f
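Note on the commit title: the diff below only evaluates whatever kb_id_ the entity_linker pipe has assigned; the selection of the highest-frequency candidate itself is not shown in these hunks. As a rough, hypothetical sketch (not part of this commit), such a baseline could be expressed with the kb.get_candidates() call and the entity_freq / entity_ candidate attributes that do appear in the example code:

def highest_freq_candidate(kb, alias):
    # Hypothetical helper (not in this commit): return the KB id of the
    # candidate entity that occurs most frequently for the given alias.
    candidates = kb.get_candidates(alias)
    if not candidates:
        return None
    return max(candidates, key=lambda c: c.entity_freq).entity_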
@@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
     if to_print:
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
 
-
-def test_kb(kb):
-    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    nlp = spacy.load('en_core_web_sm')
-
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    candidates = kb.get_candidates("Bush")
-
-    print("generating candidates for 'Bush' :")
-    for c in candidates:
-        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
-    print()
-
-    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
-           "Douglas reminds us to always bring our towel. " \
-           "The main character in Doug's novel is the man Arthur Dent, " \
-           "but Douglas doesn't write about George Washington or Homer Simpson."
-    doc = nlp(text)
-
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
@@ -1,12 +1,113 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import os
 import spacy
+import datetime
+from os import listdir
+
+from examples.pipeline.wiki_entity_linking import training_set_creator
 
 # requires: pip install neuralcoref --no-binary neuralcoref
 # import neuralcoref
 
 
+def run_el_toy_example(nlp, kb):
+    _prepare_pipeline(nlp, kb)
+
+    candidates = kb.get_candidates("Bush")
+
+    print("generating candidates for 'Bush' :")
+    for c in candidates:
+        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
+    print()
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
+    doc = nlp(text)
+
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+def run_el_training(nlp, kb, training_dir, limit=None):
+    _prepare_pipeline(nlp, kb)
+
+    correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
+                                                                                 collect_correct=True,
+                                                                                 collect_incorrect=False)
+
+    predictions = list()
+    golds = list()
+
+    cnt = 0
+    for f in listdir(training_dir):
+        if not limit or cnt < limit:
+            if is_dev(f):
+                article_id = f.replace(".txt", "")
+                if cnt % 500 == 0:
+                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
+                cnt += 1
+                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
+                    text = file.read()
+                    doc = nlp(text)
+                    for ent in doc.ents:
+                        if ent.label_ == "PERSON":  # TODO: expand to other types
+                            gold_entity = correct_entries_per_article[article_id].get(ent.text, None)
+                            # only evaluating gold entities we know, because the training data is not complete
+                            if gold_entity:
+                                predictions.append(ent.kb_id_)
+                                golds.append(gold_entity)
+
+    print("Processed", cnt, "dev articles")
+    print()
+    evaluate(predictions, golds)
+
+
+def is_dev(file_name):
+    return file_name.endswith("3.txt")
+
+
+def evaluate(predictions, golds):
+    if len(predictions) != len(golds):
+        raise ValueError("predictions and gold entities should have the same length")
+
+    print("Evaluating", len(golds), "entities")
+
+    tp = 0
+    fp = 0
+    fn = 0
+
+    for pred, gold in zip(predictions, golds):
+        is_correct = pred == gold
+        if not pred:
+            fn += 1
+        elif is_correct:
+            tp += 1
+        else:
+            fp += 1
+
+    print("tp", tp)
+    print("fp", fp)
+    print("fn", fn)
+
+    precision = tp / (tp + fp + 0.0000001)
+    recall = tp / (tp + fn + 0.0000001)
+    fscore = 2 * recall * precision / (recall + precision + 0.0000001)
+
+    print("precision", round(100 * precision, 1), "%")
+    print("recall", round(100 * recall, 1), "%")
+    print("Fscore", round(100 * fscore, 1), "%")
+
+
+def _prepare_pipeline(nlp, kb):
+    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+
 # TODO
 def add_coref():
     """ Add coreference resolution to our model """
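A quick illustration of the counting scheme in the new evaluate() helper, where an empty prediction counts as a false negative and a wrong id as a false positive; this is a self-contained sketch with made-up KB ids, not code from the commit:

predictions = ["Q42", "", "Q123"]   # hypothetical ids produced by the entity linker
golds = ["Q42", "Q7", "Q999"]       # hypothetical gold ids from the training data

tp = fp = fn = 0
for pred, gold in zip(predictions, golds):
    if not pred:
        fn += 1          # nothing predicted
    elif pred == gold:
        tp += 1          # correct link
    else:
        fp += 1          # wrong link

# tp=1, fp=1, fn=1 -> precision = recall = F-score = 50%
print(tp, fp, fn, tp / (tp + fp), tp / (tp + fn))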
@@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
 Process Wikipedia interlinks to generate a training dataset for the EL algorithm
 """
 
+ENTITY_FILE = "gold_entities.csv"
 
 def create_training(kb, entity_input, training_output):
     if not kb:
@@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
 
     read_ids = set()
 
-    entityfile_loc = training_output + "/" + "gold_entities.csv"
+    entityfile_loc = training_output + "/" + ENTITY_FILE
     with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
         # write entity training header file
         _write_training_entity(outputfile=entityfile,
@@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):
 
 def _write_training_entity(outputfile, article_id, alias, entity, correct):
     outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
+
+
+def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
+    entityfile_loc = training_output + "/" + ENTITY_FILE
+    incorrect_entries_per_article = dict()
+    correct_entries_per_article = dict()
+
+    with open(entityfile_loc, mode='r', encoding='utf8') as file:
+        for line in file:
+            fields = line.replace('\n', "").split(sep='|')
+            article_id = fields[0]
+            alias = fields[1]
+            entity = fields[2]
+            correct = fields[3]
+
+            if correct == "1" and collect_correct:
+                entry_dict = correct_entries_per_article.get(article_id, dict())
+                if alias in entry_dict:
+                    raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE)
+                entry_dict[alias] = entity
+                correct_entries_per_article[article_id] = entry_dict
+
+            if correct == "0" and collect_incorrect:
+                entry_dict = incorrect_entries_per_article.get(article_id, dict())
+                entities = entry_dict.get(alias, set())
+                entities.add(entity)
+                entry_dict[alias] = entities
+                incorrect_entries_per_article[article_id] = entry_dict
+
+    return correct_entries_per_article, incorrect_entries_per_article
+
+
+
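For orientation, read_training_entities() above parses the pipe-separated article_id|alias|entity|correct records written by _write_training_entity(). A minimal standalone sketch of that layout and the parsing, with made-up article and entity ids (not code from the commit):

import io

# two hypothetical records: one correct link (flag "1"), one incorrect (flag "0")
sample = "12345|Douglas Adams|Douglas_Adams|1\n" \
         "12345|Douglas Adams|Douglas_Adams_(disambiguation)|0\n"

correct, incorrect = dict(), dict()
for line in io.StringIO(sample):
    article_id, alias, entity, flag = line.strip().split("|")
    if flag == "1":
        correct.setdefault(article_id, dict())[alias] = entity
    else:
        incorrect.setdefault(article_id, dict()).setdefault(alias, set()).add(entity)

print(correct)    # {'12345': {'Douglas Adams': 'Douglas_Adams'}}
print(incorrect)  # {'12345': {'Douglas Adams': {'Douglas_Adams_(disambiguation)'}}}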
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from . import wikipedia_processor as wp, kb_creator, training_set_creator
+from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
 
 import spacy
 from spacy.vocab import Vocab
@@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
-TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
-TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
+TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
 
 
 if __name__ == "__main__":
@@ -37,8 +36,12 @@ if __name__ == "__main__":
     to_read_kb = True
     to_test_kb = False
 
+    # create training dataset
     create_wp_training = False
 
+    # apply named entity linking to the training dataset
+    apply_to_training = True
+
     # STEP 1 : create prior probabilities from WP
     # run only once !
     if to_create_prior_probs:
@@ -88,13 +91,21 @@ if __name__ == "__main__":
 
     # test KB
     if to_test_kb:
-        kb_creator.test_kb(my_kb)
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
         print()
 
     # STEP 5: create a training dataset from WP
     if create_wp_training:
         print("STEP 5: create training dataset", datetime.datetime.now())
-        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
+        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)
+
+    # STEP 6: apply the EL algorithm on the training dataset
+    if apply_to_training:
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
+        print()
+
 
     # TODO coreference resolution
     # add_coref()