mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* fix overflow error on windows * more documentation & logging fixes * md fix * 3 different limit parameters to play with execution time * bug fixes directory locations * small fixes * exclude dev test articles from prior probabilities stats * small fixes * filtering wikidata entities, removing numeric and meta items * adding aliases from wikidata also to the KB * fix adding WD aliases * adding also new aliases to previously added entities * fixing comma's * small doc fixes * adding subclassof filtering * append alias functionality in KB * prevent appending the same entity-alias pair * fix for appending WD aliases * remove date filter * remove unnecessary import * small corrections and reformatting * remove WD aliases for now (too slow) * removing numeric entities from training and evaluation * small fixes * shortcut during prediction if there is only one candidate * add counts and fscore logging, remove FP NER from evaluation * fix entity_linker.predict to take docs instead of single sentences * remove enumeration sentences from the WP dataset * entity_linker.update to process full doc instead of single sentence * spelling corrections and dump locations in readme * NLP IO fix * reading KB is unnecessary at the end of the pipeline * small logging fix * remove empty files
		
			
				
	
	
		
			129 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			129 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
# List of meta pages in Wikidata, should be kept out of the Knowledge base.
# Each entry is a Wikidata item identifier (the "Q..." string); the order of
# the entries is kept as-is so that any caller relying on it is unaffected.
WD_META_ITEMS = [
    "Q163875",
    "Q191780",
    "Q224414",
    "Q4167836",
    "Q4167410",
    "Q4663903",
    "Q11266439",
    "Q13406463",
    "Q15407973",
    "Q18616576",
    "Q19887878",
    "Q22808320",
    "Q23894233",
    "Q33120876",
    "Q42104522",
    "Q47460393",
    "Q64875536",
    "Q66480449",
]
 | 
						|
 | 
						|
 | 
						|
# TODO: add more cases from non-English WP's
 | 
						|
 | 
						|
# List of prefixes that refer to Wikipedia "file" pages.
# This list is also included at the start of WP_META_NAMESPACE.
WP_FILE_NAMESPACE = ["Bestand", "File"]
 | 
						|
 | 
						|
# List of prefixes that refer to Wikipedia "category" pages.
# This list is also included in WP_META_NAMESPACE, right after the file prefixes.
WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
 | 
						|
 | 
						|
# List of prefixes that refer to Wikipedia "meta" pages.
# These will/should be matched ignoring case: the entries below are stored in
# mixed case and are not normalized here, so any case folding has to happen
# where the matching is done.
# The result is a new list: the file prefixes first, then the category
# prefixes, then the additional prefixes listed below.
WP_META_NAMESPACE = (
    WP_FILE_NAMESPACE
    + WP_CATEGORY_NAMESPACE
    + [
        "b",
        "betawikiversity",
        "Book",
        "c",
        "Commons",
        "d",
        "dbdump",
        "download",
        "Draft",
        "Education",
        "Foundation",
        "Gadget",
        "Gadget definition",
        "Gebruiker",
        "gerrit",
        "Help",
        "Image",
        "Incubator",
        "m",
        "mail",
        "mailarchive",
        "media",
        "MediaWiki",
        "MediaWiki talk",
        "Mediawikiwiki",
        "MediaZilla",
        "Meta",
        "Metawikipedia",
        "Module",
        "mw",
        "n",
        "nost",
        "oldwikisource",
        "otrs",
        "OTRSwiki",
        "Overleg gebruiker",
        "outreach",
        "outreachwiki",
        "Portal",
        "phab",
        "Phabricator",
        "Project",
        "q",
        "quality",
        "rev",
        "s",
        "spcom",
        "Special",
        "species",
        "Strategy",
        "sulutil",
        "svn",
        "Talk",
        "Template",
        "Template talk",
        "Testwiki",
        "ticket",
        "TimedText",
        "Toollabs",
        "tools",
        "tswiki",
        "User",
        "User talk",
        "v",
        "voy",
        "w",
        "Wikibooks",
        "Wikidata",
        "wikiHow",
        "Wikinvest",
        "wikilivres",
        "Wikimedia",
        "Wikinews",
        "Wikipedia",
        "Wikipedia talk",
        "Wikiquote",
        "Wikisource",
        "Wikispecies",
        "Wikitech",
        "Wikiversity",
        "Wikivoyage",
        "wikt",
        "wiktionary",
        "wmf",
        "wmania",
        "WP",
    ]
)
 |