mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
2d249a9502
* fix overflow error on windows * more documentation & logging fixes * md fix * 3 different limit parameters to play with execution time * bug fixes directory locations * small fixes * exclude dev test articles from prior probabilities stats * small fixes * filtering wikidata entities, removing numeric and meta items * adding aliases from wikidata also to the KB * fix adding WD aliases * adding also new aliases to previously added entities * fixing comma's * small doc fixes * adding subclassof filtering * append alias functionality in KB * prevent appending the same entity-alias pair * fix for appending WD aliases * remove date filter * remove unnecessary import * small corrections and reformatting * remove WD aliases for now (too slow) * removing numeric entities from training and evaluation * small fixes * shortcut during prediction if there is only one candidate * add counts and fscore logging, remove FP NER from evaluation * fix entity_linker.predict to take docs instead of single sentences * remove enumeration sentences from the WP dataset * entity_linker.update to process full doc instead of single sentence * spelling corrections and dump locations in readme * NLP IO fix * reading KB is unnecessary at the end of the pipeline * small logging fix * remove empty files
128 lines
4.1 KiB
Python
128 lines
4.1 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import sys
|
|
import csv
|
|
|
|
# Raise the csv module's maximum field size for every csv.reader used below.
# min() is needed to prevent an overflow error on Windows when sys.maxsize is
# passed directly, cf https://stackoverflow.com/questions/52404416/
csv.field_size_limit(min(sys.maxsize, 2147483646))
|
|
|
|
""" This class provides reading/writing methods for temp files """
|
|
|
|
|
|
# Entity definition: WP title -> WD ID #
|
|
def write_title_to_id(entity_def_output, title_to_id):
    """ Write the WP title -> WD ID mapping to a pipe-delimited file.

    entity_def_output: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`. An existing file is overwritten.
    title_to_id: dict mapping a title string to its ID; IDs are passed
        through `str()` before being written.

    The first line is the header "WP_title|WD_id". Rows are emitted with
    `csv.writer`, so a title that contains "|" or a double quote is quoted and
    is read back unchanged by `csv.reader(..., delimiter="|")`. Values without
    such characters are written exactly as plain concatenation would.
    """
    with entity_def_output.open("w", encoding="utf8") as id_file:
        # lineterminator="\n" keeps line endings identical to plain write() calls
        writer = csv.writer(id_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WP_title", "WD_id"])
        for title, qid in title_to_id.items():
            writer.writerow([title, str(qid)])
|
|
|
|
|
|
def read_title_to_id(entity_def_output):
    """ Read a pipe-delimited "WP_title|WD_id" file into a dict.

    entity_def_output: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`.

    Returns a dict mapping the first column of each row to the second column
    (both as strings). The first line is treated as a header and skipped.
    If a key occurs more than once, the last row wins. An empty file yields
    an empty dict.
    """
    with entity_def_output.open("r", encoding="utf8") as id_file:
        csvreader = csv.reader(id_file, delimiter="|")
        # skip header; the default stops an empty file from leaking StopIteration
        next(csvreader, None)
        return {row[0]: row[1] for row in csvreader}
|
|
|
|
|
|
# Entity aliases from WD: WD ID -> WD alias #
|
|
def write_id_to_alias(entity_alias_path, id_to_alias):
    """ Write the WD ID -> aliases mapping to a pipe-delimited file.

    entity_alias_path: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`. An existing file is overwritten.
    id_to_alias: dict mapping an ID to an iterable of alias strings.

    The first line is the header "WD_id|alias"; after that one row is written
    per (ID, alias) pair, so an ID with no aliases produces no rows. Rows are
    emitted with `csv.writer`, so an alias that contains "|" or a double quote
    is quoted and is read back unchanged by `csv.reader(..., delimiter="|")`.
    """
    with entity_alias_path.open("w", encoding="utf8") as alias_file:
        # lineterminator="\n" keeps line endings identical to plain write() calls
        writer = csv.writer(alias_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WD_id", "alias"])
        for qid, alias_list in id_to_alias.items():
            for alias in alias_list:
                writer.writerow([str(qid), alias])
|
|
|
|
|
|
def read_id_to_alias(entity_alias_path):
    """ Read a pipe-delimited "WD_id|alias" file into a dict of lists.

    entity_alias_path: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`.

    Returns a dict mapping each ID (first column) to the list of aliases
    (second column) found for it, in file order. The first line is treated
    as a header and skipped. An empty file yields an empty dict.
    """
    id_to_alias = dict()
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        csvreader = csv.reader(alias_file, delimiter="|")
        # skip header; the default stops an empty file from leaking StopIteration
        next(csvreader, None)
        for row in csvreader:
            # create the list on first sight of an ID, then append to it
            id_to_alias.setdefault(row[0], []).append(row[1])
    return id_to_alias
|
|
|
|
|
|
def read_alias_to_id_generator(entity_alias_path):
    """ Lazily read (alias, qid) tuples from a "WD_id|alias" file.

    entity_alias_path: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`.

    Yields one `(alias, qid)` tuple per row, where qid is the first column
    and alias the second. The first line is treated as a header and skipped.
    The file stays open until the generator is exhausted or closed. An empty
    file yields nothing.
    """
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        csvreader = csv.reader(alias_file, delimiter="|")
        # skip header; a bare next() on an empty file would raise StopIteration
        # inside this generator, which Python turns into a RuntimeError (PEP 479)
        next(csvreader, None)
        for row in csvreader:
            qid = row[0]
            alias = row[1]
            yield alias, qid
|
|
|
|
|
|
# Entity descriptions from WD: WD ID -> WD description #
|
|
def write_id_to_descr(entity_descr_output, id_to_descr):
    """ Write the WD ID -> description mapping to a pipe-delimited file.

    entity_descr_output: object exposing `.open(mode, encoding=...)`, such as
        a `pathlib.Path`. An existing file is overwritten.
    id_to_descr: dict mapping an ID to its description string.

    The first line is the header "WD_id|description". Rows are emitted with
    `csv.writer`, so a description that contains "|" or a double quote is
    quoted and is read back unchanged by `csv.reader(..., delimiter="|")`.
    Descriptions without such characters are written exactly as plain
    concatenation would.
    """
    with entity_descr_output.open("w", encoding="utf8") as descr_file:
        # lineterminator="\n" keeps line endings identical to plain write() calls
        writer = csv.writer(descr_file, delimiter="|", lineterminator="\n")
        writer.writerow(["WD_id", "description"])
        for qid, descr in id_to_descr.items():
            writer.writerow([str(qid), descr])
|
|
|
|
|
|
def read_id_to_descr(entity_desc_path):
    """ Read a pipe-delimited "WD_id|description" file into a dict.

    entity_desc_path: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`.

    Returns a dict mapping the first column of each row to the second column
    (both as strings). The first line is treated as a header and skipped.
    If an ID occurs more than once, the last row wins. An empty file yields
    an empty dict.
    """
    with entity_desc_path.open("r", encoding="utf8") as descr_file:
        csvreader = csv.reader(descr_file, delimiter="|")
        # skip header; the default stops an empty file from leaking StopIteration
        next(csvreader, None)
        return {row[0]: row[1] for row in csvreader}
|
|
|
|
|
|
# Entity counts from WP: WP title -> count #
|
|
def write_entity_to_count(prior_prob_input, count_output):
    """ Sum the counts per entity and write them out for quick access later.

    prior_prob_input: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`. The file is pipe-delimited with a header line; each
        following line is split on "|" and must hold an integer count in
        its second column and the entity in its third.
    count_output: object of the same kind; an existing file is overwritten.

    The output starts with the header "entity|count", followed by one row per
    distinct entity with the sum of its counts, in order of first appearance.
    Rows are emitted with `csv.writer`, so an entity name containing a double
    quote is read back unchanged by `csv.reader(..., delimiter="|")`.

    Raises ValueError if a count is not an integer, and IndexError if a line
    has fewer than three columns.
    """
    entity_to_count = dict()

    with prior_prob_input.open("r", encoding="utf8") as prior_file:
        # skip header
        next(prior_file, None)
        for line in prior_file:
            splits = line.replace("\n", "").split(sep="|")
            # splits[0] is the alias, which is not needed for the totals
            count = int(splits[1])
            entity = splits[2]
            entity_to_count[entity] = entity_to_count.get(entity, 0) + count

    with count_output.open("w", encoding="utf8") as entity_file:
        # lineterminator="\n" keeps line endings identical to plain write() calls
        writer = csv.writer(entity_file, delimiter="|", lineterminator="\n")
        writer.writerow(["entity", "count"])
        for entity, count in entity_to_count.items():
            writer.writerow([entity, str(count)])
|
|
|
|
|
|
def read_entity_to_count(count_input):
    """ Read a pipe-delimited "entity|count" file into a dict.

    count_input: object exposing `.open(mode, encoding=...)`, such as a
        `pathlib.Path`.

    Returns a dict mapping the first column of each row (string) to the
    second column converted with `int()`. The first line is treated as a
    header and skipped. An empty file yields an empty dict.

    Raises ValueError if a count cannot be parsed as an integer.
    """
    with count_input.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter="|")
        # skip header; the default stops an empty file from leaking StopIteration
        next(csvreader, None)
        return {row[0]: int(row[1]) for row in csvreader}
|