Mirror of https://github.com/explosion/spaCy.git
parse wp dump for links to determine prior probabilities

parent 3163331b1e
commit 6763e025e1
@@ -6,9 +6,27 @@ from __future__ import unicode_literals
 import re
 import json
 import spacy
+import datetime
 import bz2
 from spacy.kb import KnowledgeBase

+# these will/should be matched ignoring case
+wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
+                   "d", "dbdump", "download", "Draft", "Education", "Foundation",
+                   "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
+                   "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
+                   "MediaZilla", "Meta", "Metawikipedia", "Module",
+                   "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
+                   "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
+                   "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
+                   "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki",
+                   "User", "User talk", "v", "voy",
+                   "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
+                   "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
+                   "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]
+
+map_alias_to_link = dict()
+

 def create_kb(vocab):
     kb = KnowledgeBase(vocab=vocab)
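
The module-level map_alias_to_link dict added here is filled by the new _store_alias helper further down in this diff: each alias (link text) maps to the entity pages it was seen linking to, together with counts. A minimal sketch of the resulting structure, with invented aliases and counts (note that _store_alias runs str.capitalize() on the entity title, hence the lowercased tails):

# Hypothetical contents after scanning a few articles (illustration only):
map_alias_to_link = {
    "Douglas Adams": {"Douglas adams": 3},
    "Adams": {"Douglas adams": 1, "John adams": 4},
}

# The most frequently linked entity for an alias can then be read off directly:
best_entity = max(map_alias_to_link["Adams"].items(), key=lambda x: x[1])[0]
print(best_entity)  # John adams
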
@@ -38,7 +56,7 @@ def _read_wikidata():
     with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
         line = file.readline()
         cnt = 1
-        while line and cnt < 10:
+        while line and cnt < 100000:
             clean_line = line.strip()
             if clean_line.endswith(b","):
                 clean_line = clean_line[:-1]
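
For context on the b"," handling in the loop above: the Wikidata JSON dump is one large JSON array with one entity per line, so every entity line ends with a comma and the first and last lines are just the array brackets. A minimal sketch of turning such lines into dicts (the helper name and the commented usage are illustrative, not part of the commit):

import bz2
import json


def iter_wikidata_entities(dump_path):
    """Yield one entity dict per line of a wikidata-*-all.json.bz2 dump."""
    with bz2.open(dump_path, mode='rb') as file:
        for line in file:
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if clean_line in (b"[", b"]", b""):
                continue  # skip the array brackets and empty lines
            yield json.loads(clean_line)


# for entity in iter_wikidata_entities('wikidata-20190304-all.json.bz2'):
#     print(entity.get("id"), entity.get("labels", {}).get("en", {}).get("value"))
#     break
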
@@ -91,6 +109,78 @@ def _read_wikidata():
             cnt += 1


+def _read_wikipedia_prior_probs():
+    """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """
+
+    # find the links
+    link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
+
+    # match on interwiki links, e.g. `en:` or `:fr:`
+    ns_regex = r":?" + "[a-z][a-z]" + ":"
+
+    # match on Namespace: optionally preceded by a :
+    for ns in wiki_namespaces:
+        ns_regex += "|" + ":?" + ns + ":"
+
+    ns_regex = re.compile(ns_regex, re.IGNORECASE)
+
+    # TODO remove hardcoded path
+    with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file:
+        line = file.readline()
+        cnt = 0
+        while line:
+            if cnt % 5000000 == 0:
+                print(datetime.datetime.now(), "processed", cnt, "lines")
+            clean_line = line.strip().decode("utf-8")
+
+            matches = link_regex.findall(clean_line)
+            for match in matches:
+                match = match[2:][:-2].replace("_", " ").strip()
+
+                if ns_regex.match(match):
+                    pass  # ignore namespaces at the beginning of the string
+
+                # this is a simple link, with the alias the same as the mention
+                elif "|" not in match:
+                    _store_alias(match, match)
+
+                # in wiki format, the link is written as [[entity|alias]]
+                else:
+                    splits = match.split("|")
+                    entity = splits[0].strip()
+                    alias = splits[1].strip()
+                    # specific wiki format [[alias (specification)|]]
+                    if len(alias) == 0 and "(" in entity:
+                        alias = entity.split("(")[0]
+                        _store_alias(alias, entity)
+                    else:
+                        _store_alias(alias, entity)
+
+            line = file.readline()
+            cnt += 1
+
+    # only print aliases with more than one potential entity
+    # TODO remove hardcoded path
+    with open('C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv', mode='w', encoding='utf8') as outputfile:
+        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
+        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
+            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
+                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
+
+
+def _store_alias(alias, entity):
+    alias = alias.strip()
+    entity = entity.strip()
+
+    # remove everything after # as this is not part of the title but refers to a specific paragraph
+    clean_entity = entity.split("#")[0].capitalize()
+
+    if len(alias) > 0 and len(clean_entity) > 0:
+        alias_dict = map_alias_to_link.get(alias, dict())
+        entity_count = alias_dict.get(clean_entity, 0)
+        alias_dict[clean_entity] = entity_count + 1
+        map_alias_to_link[alias] = alias_dict
+
+
 def _read_wikipedia():
     """ Read the XML wikipedia data """
     # TODO remove hardcoded path
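
The CSV produced above only contains raw link counts per alias/entity pair; the prior probabilities referred to in the commit message still have to be derived from them, for example as P(entity | alias) = count(alias, entity) / sum over all entities of count(alias, entity). A possible sketch of that step (the function name and the normalisation are assumptions, not part of this commit):

def read_prior_probs(prior_prob_path):
    """Turn the alias|count|entity counts into P(entity | alias) dictionaries."""
    counts_by_alias = dict()
    with open(prior_prob_path, mode='r', encoding='utf8') as csvfile:
        next(csvfile)  # skip the "alias|count|entity" header
        for csv_line in csvfile:
            alias, count, entity = csv_line.rstrip("\n").split("|")
            counts_by_alias.setdefault(alias, dict())[entity] = int(count)

    prior_probs = dict()
    for alias, entity_counts in counts_by_alias.items():
        total = sum(entity_counts.values())
        prior_probs[alias] = {entity: count / total
                              for entity, count in entity_counts.items()}
    return prior_probs

Per-alias distributions like this are the kind of candidate priors that could later be registered in the KnowledgeBase when an alias is ambiguous.
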
@@ -103,7 +193,7 @@ def _read_wikipedia():
         article_title = None
         article_id = None
         reading_text = False
-        while line and cnt < 10000:
+        while line and cnt < 1000000:
             clean_line = line.strip().decode("utf-8")

             # Start reading new page
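
Most of _read_wikipedia falls outside this hunk; the context lines (article_title, article_id, reading_text) suggest a line-based scan over the <page> blocks of the XML dump. A rough sketch of that kind of loop, as an illustration only; the real function may differ in the details:

def _sketch_read_pages(file, handle_article):
    """Illustrative line-based scan over <page> blocks of a Wikipedia XML dump."""
    article_text = ""
    article_title = None
    article_id = None
    reading_text = False
    for line in file:
        clean_line = line.strip().decode("utf-8")
        if clean_line == "<page>":
            # Start reading new page: reset the per-article state
            article_text = ""
            article_title = None
            article_id = None
        elif clean_line == "</page>" and article_id and article_title:
            handle_article(article_id, article_title, article_text.strip())
        elif clean_line.startswith("<title>") and clean_line.endswith("</title>"):
            article_title = clean_line[len("<title>"):-len("</title>")]
        elif article_id is None and clean_line.startswith("<id>") and clean_line.endswith("</id>"):
            article_id = clean_line[len("<id>"):-len("</id>")]
        elif clean_line.startswith("<text"):
            reading_text = True
        elif clean_line.endswith("</text>"):
            reading_text = False
        elif reading_text:
            article_text += clean_line + "\n"
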
@@ -143,28 +233,51 @@ def _read_wikipedia():


 def _store_wp_article(article_id, article_title, article_text):
+    pass
     print("WP article", article_id, ":", article_title)
     print(article_text)
     print(_get_clean_wp_text(article_text))
     print()


 def _get_clean_wp_text(article_text):
-    # remove category statements
-    clean_text = re.sub('\[\[Category:.*\]\]', '', article_text)
+    # TODO: compile the regular expressions
+
+    # remove Category and File statements
+    clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text)
+    print("1", clean_text)
+    clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text)  # TODO: this doesn't work yet
+    print("2", clean_text)
+
+    # remove bolding markup
+    clean_text = re.sub('\'\'\'', '', clean_text)
+    clean_text = re.sub('\'\'', '', clean_text)

     # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
     try_again = True
     previous_length = len(clean_text)
     while try_again:
-        clean_text = re.sub('{[^{]*?}', '', clean_text)  # non-greedy match
-        print(clean_text)
+        clean_text = re.sub('{[^{]*?}', '', clean_text)  # non-greedy match excluding a nested {
         if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

+    # remove multiple spaces
+    while '  ' in clean_text:
+        clean_text = re.sub('  ', ' ', clean_text)
+
+    # remove simple interwiki links (no alternative name)
+    clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text)
+
+    # remove simple interwiki links by picking the alternative name
+    clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text)
+
+    # remove HTML comments
+    clean_text = re.sub('<!--[^!]*-->', '', clean_text)
+
     return clean_text

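
To make the new clean-up steps in _get_clean_wp_text concrete, here is a small demonstration on an invented wikitext snippet, reusing the same patterns (the File: pattern is left out since the commit's own TODO notes that it does not work yet):

import re

raw = ("'''Douglas Adams''' was an <!-- English --> author. "
       "He wrote [[The Hitchhiker's Guide to the Galaxy]] and lived in "
       "[[Islington|London]]. [[Category:English humorists]]")

text = re.sub(r'\[\[Category:[^\[]*]]', '', raw)        # drop Category links
text = re.sub("'''", '', text)                          # drop bold markup
text = re.sub("''", '', text)
text = re.sub(r'\[\[([^|]*?)]]', r'\1', text)           # keep the text of simple links
text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', text)   # keep the alias of piped links
text = re.sub('<!--[^!]*-->', '', text)                 # drop HTML comments
while '  ' in text:
    text = re.sub('  ', ' ', text)

print(text.strip())
# Douglas Adams was an author. He wrote The Hitchhiker's Guide to the Galaxy and lived in London.
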
@@ -187,6 +300,13 @@ def add_el(kb, nlp):


 if __name__ == "__main__":
-    nlp = spacy.load('en_core_web_sm')
-    my_kb = create_kb(nlp.vocab)
+    _read_wikipedia_prior_probs()
+
+    # nlp = spacy.load('en_core_web_sm')
+    # my_kb = create_kb(nlp.vocab)
     # add_el(my_kb, nlp)
+
+    # clean_text = "[[File:smomething]] jhk"
+    # clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', clean_text)
+    # clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text)
+    # print(clean_text)
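
With this change the script's entry point only runs the new dump parsing, and the input and output locations remain the hardcoded local paths flagged by the TODO comments. One possible way to make that reusable, purely a suggestion and not part of the commit, is to pass the paths in on the command line:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Estimate prior probabilities from a Wikipedia dump")
    parser.add_argument("wp_xml", help="path to enwiki-*-pages-articles-multistream.xml.bz2")
    parser.add_argument("prior_prob_output", help="where to write the alias|count|entity CSV")
    args = parser.parse_args()

    # _read_wikipedia_prior_probs would then take these paths as parameters
    # instead of relying on the hardcoded ones, e.g.:
    # _read_wikipedia_prior_probs(args.wp_xml, args.prior_prob_output)
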