* fix overflow error on Windows
* more documentation & logging fixes
* md fix
* 3 different limit parameters to play with execution time
* bug fixes for directory locations
* small fixes
* exclude dev test articles from prior probability stats
* small fixes
* filtering Wikidata entities, removing numeric and meta items
* adding aliases from Wikidata also to the KB
* fix adding WD aliases
* adding also new aliases to previously added entities
* fixing commas
* small doc fixes
* adding subclass-of filtering
* append alias functionality in KB
* prevent appending the same entity-alias pair
* fix for appending WD aliases
* remove date filter
* remove unnecessary import
* small corrections and reformatting
* remove WD aliases for now (too slow)
* removing numeric entities from training and evaluation
* small fixes
* shortcut during prediction if there is only one candidate
* add counts and F-score logging, remove FP NER from evaluation
* fix entity_linker.predict to take docs instead of single sentences
* remove enumeration sentences from the WP dataset
* entity_linker.update to process full doc instead of single sentence
* spelling corrections and dump locations in readme
* NLP IO fix
* reading KB is unnecessary at the end of the pipeline
* small logging fix
* remove empty files
		
			
				
	
	
		
# coding: utf-8
from __future__ import unicode_literals

import sys
import csv

# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
# (on 64-bit Windows, sys.maxsize exceeds the C long range accepted by csv.field_size_limit,
# so passing it directly raises OverflowError; 2147483646 is 2**31 - 2)
csv.field_size_limit(min(sys.maxsize, 2147483646))

""" This module provides reading/writing methods for temp files """


# Entity definition: WP title -> WD ID #
def write_title_to_id(entity_def_output, title_to_id):
    with entity_def_output.open("w", encoding="utf8") as id_file:
        id_file.write("WP_title" + "|" + "WD_id" + "\n")
        for title, qid in title_to_id.items():
            id_file.write(title + "|" + str(qid) + "\n")


def read_title_to_id(entity_def_output):
    title_to_id = dict()
    with entity_def_output.open("r", encoding="utf8") as id_file:
        csvreader = csv.reader(id_file, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            title_to_id[row[0]] = row[1]
    return title_to_id
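

# Usage sketch for the entity-definition helpers above (illustrative only; the
# path and entries below are hypothetical, not part of this module):
#
#     from pathlib import Path
#     entity_def_path = Path("entity_defs.csv")
#     write_title_to_id(entity_def_path, {"Douglas Adams": "Q42"})
#     read_title_to_id(entity_def_path)  # -> {"Douglas Adams": "Q42"}
#
# The file is pipe-delimited with a "WP_title|WD_id" header row; the format
# assumes that neither field contains the "|" delimiter itself.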


# Entity aliases from WD: WD ID -> WD alias #
def write_id_to_alias(entity_alias_path, id_to_alias):
    with entity_alias_path.open("w", encoding="utf8") as alias_file:
        alias_file.write("WD_id" + "|" + "alias" + "\n")
        for qid, alias_list in id_to_alias.items():
            for alias in alias_list:
                alias_file.write(str(qid) + "|" + alias + "\n")


def read_id_to_alias(entity_alias_path):
    id_to_alias = dict()
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        csvreader = csv.reader(alias_file, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            qid = row[0]
            alias = row[1]
            alias_list = id_to_alias.get(qid, [])
            alias_list.append(alias)
            id_to_alias[qid] = alias_list
    return id_to_alias


def read_alias_to_id_generator(entity_alias_path):
    """ Yield (alias, qid) tuples """
    with entity_alias_path.open("r", encoding="utf8") as alias_file:
        csvreader = csv.reader(alias_file, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            qid = row[0]
            alias = row[1]
            yield alias, qid
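

# Usage sketch for the alias helpers above (illustrative only; the path and
# entries are hypothetical):
#
#     from pathlib import Path
#     alias_path = Path("entity_alias.csv")
#     write_id_to_alias(alias_path, {"Q42": ["Douglas Adams", "Douglas Noel Adams"]})
#     read_id_to_alias(alias_path)  # -> {"Q42": ["Douglas Adams", "Douglas Noel Adams"]}
#     list(read_alias_to_id_generator(alias_path))  # -> [("Douglas Adams", "Q42"), ...]
#
# read_id_to_alias loads the whole file into one dict, while the generator
# variant streams (alias, qid) pairs without holding everything in memory.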


# Entity descriptions from WD: WD ID -> WD description #
def write_id_to_descr(entity_descr_output, id_to_descr):
    with entity_descr_output.open("w", encoding="utf8") as descr_file:
        descr_file.write("WD_id" + "|" + "description" + "\n")
        for qid, descr in id_to_descr.items():
            descr_file.write(str(qid) + "|" + descr + "\n")


def read_id_to_descr(entity_desc_path):
    id_to_desc = dict()
    with entity_desc_path.open("r", encoding="utf8") as descr_file:
        csvreader = csv.reader(descr_file, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            id_to_desc[row[0]] = row[1]
    return id_to_desc
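

# Usage sketch for the description helpers above (illustrative only; the path
# and entry are hypothetical):
#
#     from pathlib import Path
#     descr_path = Path("entity_descriptions.csv")
#     write_id_to_descr(descr_path, {"Q42": "English writer and humorist"})
#     read_id_to_descr(descr_path)  # -> {"Q42": "English writer and humorist"}
#
# Each description is written on a single pipe-delimited line, so multi-line
# descriptions or ones containing "|" would need to be cleaned beforehand.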


# Entity counts from WP: WP title -> count #
def write_entity_to_count(prior_prob_input, count_output):
    # Write entity counts for quick access later
    entity_to_count = dict()
    total_count = 0

    with prior_prob_input.open("r", encoding="utf8") as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()

        while line:
            splits = line.replace("\n", "").split(sep="|")
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count

            total_count += count

            line = prior_file.readline()

    with count_output.open("w", encoding="utf8") as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")


def read_entity_to_count(count_input):
    entity_to_count = dict()
    with count_input.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_count[row[0]] = int(row[1])

    return entity_to_count
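

# Usage sketch for the count helpers above (illustrative only; the paths and
# rows are hypothetical). write_entity_to_count expects a prior-probability
# file with a header line followed by pipe-delimited "alias|count|entity" rows
# (e.g. "Adams|12|Douglas Adams") and aggregates the counts per entity:
#
#     from pathlib import Path
#     write_entity_to_count(Path("prior_prob.csv"), Path("entity_counts.csv"))
#     read_entity_to_count(Path("entity_counts.csv"))  # -> {"Douglas Adams": 12, ...}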