mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
2d249a9502
* fix overflow error on windows * more documentation & logging fixes * md fix * 3 different limit parameters to play with execution time * bug fixes directory locations * small fixes * exclude dev test articles from prior probabilities stats * small fixes * filtering wikidata entities, removing numeric and meta items * adding aliases from wikidata also to the KB * fix adding WD aliases * adding also new aliases to previously added entities * fixing comma's * small doc fixes * adding subclassof filtering * append alias functionality in KB * prevent appending the same entity-alias pair * fix for appending WD aliases * remove date filter * remove unnecessary import * small corrections and reformatting * remove WD aliases for now (too slow) * removing numeric entities from training and evaluation * small fixes * shortcut during prediction if there is only one candidate * add counts and fscore logging, remove FP NER from evaluation * fix entity_linker.predict to take docs instead of single sentences * remove enumeration sentences from the WP dataset * entity_linker.update to process full doc instead of single sentence * spelling corrections and dump locations in readme * NLP IO fix * reading KB is unnecessary at the end of the pipeline * small logging fix * remove empty files
129 lines
2.5 KiB
Python
129 lines
2.5 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
# Wikidata item IDs of meta pages. Entities with these IDs should be kept out
# of the knowledge base.
WD_META_ITEMS = [
    "Q163875",
    "Q191780",
    "Q224414",
    "Q4167836",
    "Q4167410",
    "Q4663903",
    "Q11266439",
    "Q13406463",
    "Q15407973",
    "Q18616576",
    "Q19887878",
    "Q22808320",
    "Q23894233",
    "Q33120876",
    "Q42104522",
    "Q47460393",
    "Q64875536",
    "Q66480449",
]
|
|
|
|
|
|
# TODO: add more cases from non-English Wikipedias

# List of prefixes that refer to Wikipedia "file" pages
WP_FILE_NAMESPACE = ["Bestand", "File"]

# List of prefixes that refer to Wikipedia "category" pages
WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]

# List of prefixes that refer to Wikipedia "meta" pages: the "file" and
# "category" prefixes defined above, followed by the additional prefixes
# listed here (a new list is built; the two source lists are not modified).
# The entries are stored in mixed case; code that consumes this list
# will/should match them ignoring case.
WP_META_NAMESPACE = (
    WP_FILE_NAMESPACE
    + WP_CATEGORY_NAMESPACE
    + [
        "b",
        "betawikiversity",
        "Book",
        "c",
        "Commons",
        "d",
        "dbdump",
        "download",
        "Draft",
        "Education",
        "Foundation",
        "Gadget",
        "Gadget definition",
        "Gebruiker",
        "gerrit",
        "Help",
        "Image",
        "Incubator",
        "m",
        "mail",
        "mailarchive",
        "media",
        "MediaWiki",
        "MediaWiki talk",
        "Mediawikiwiki",
        "MediaZilla",
        "Meta",
        "Metawikipedia",
        "Module",
        "mw",
        "n",
        "nost",
        "oldwikisource",
        "otrs",
        "OTRSwiki",
        "Overleg gebruiker",
        "outreach",
        "outreachwiki",
        "Portal",
        "phab",
        "Phabricator",
        "Project",
        "q",
        "quality",
        "rev",
        "s",
        "spcom",
        "Special",
        "species",
        "Strategy",
        "sulutil",
        "svn",
        "Talk",
        "Template",
        "Template talk",
        "Testwiki",
        "ticket",
        "TimedText",
        "Toollabs",
        "tools",
        "tswiki",
        "User",
        "User talk",
        "v",
        "voy",
        "w",
        "Wikibooks",
        "Wikidata",
        "wikiHow",
        "Wikinvest",
        "wikilivres",
        "Wikimedia",
        "Wikinews",
        "Wikipedia",
        "Wikipedia talk",
        "Wikiquote",
        "Wikisource",
        "Wikispecies",
        "Wikitech",
        "Wikiversity",
        "Wikivoyage",
        "wikt",
        "wiktionary",
        "wmf",
        "wmania",
        "WP",
    ]
)
|