mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 05:07:03 +03:00
parsing clean text from WP articles to use as input data for NER and NEL
This commit is contained in:
parent
8353552191
commit
581dc9742d
|
@ -10,9 +10,13 @@ import json
|
||||||
import spacy
|
import spacy
|
||||||
import datetime
|
import datetime
|
||||||
import bz2
|
import bz2
|
||||||
|
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
# requires: pip install neuralcoref --no-binary neuralcoref
|
||||||
|
# import neuralcoref
|
||||||
|
|
||||||
# TODO: remove hardcoded paths
|
# TODO: remove hardcoded paths
|
||||||
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
|
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
|
||||||
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
|
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
|
||||||
|
@ -20,6 +24,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar
|
||||||
|
|
||||||
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||||
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
|
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
|
||||||
|
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
|
||||||
|
|
||||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||||
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
|
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
|
||||||
|
@ -43,7 +48,151 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
||||||
map_alias_to_link = dict()
|
map_alias_to_link = dict()
|
||||||
|
|
||||||
|
|
||||||
def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
|
def read_wikipedia_prior_probs():
|
||||||
|
"""
|
||||||
|
STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
|
||||||
|
The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
|
||||||
|
It works relatively fast because we don't care about which article we parsed the interwiki from,
|
||||||
|
we just process line by line.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||||
|
line = file.readline()
|
||||||
|
cnt = 0
|
||||||
|
while line:
|
||||||
|
if cnt % 5000000 == 0:
|
||||||
|
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
||||||
|
clean_line = line.strip().decode("utf-8")
|
||||||
|
|
||||||
|
aliases, entities, normalizations = _get_wp_links(clean_line)
|
||||||
|
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||||
|
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||||
|
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||||
|
|
||||||
|
line = file.readline()
|
||||||
|
cnt += 1
|
||||||
|
|
||||||
|
# write all aliases and their entities and occurrences to file
|
||||||
|
with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
|
||||||
|
outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
|
||||||
|
for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
|
||||||
|
for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
|
||||||
|
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
# find the links
|
||||||
|
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
|
||||||
|
|
||||||
|
# match on interwiki links, e.g. `en:` or `:fr:`
|
||||||
|
ns_regex = r":?" + "[a-z][a-z]" + ":"
|
||||||
|
|
||||||
|
# match on Namespace: optionally preceded by a :
|
||||||
|
for ns in wiki_namespaces:
|
||||||
|
ns_regex += "|" + ":?" + ns + ":"
|
||||||
|
|
||||||
|
ns_regex = re.compile(ns_regex, re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_wp_links(text):
|
||||||
|
aliases = []
|
||||||
|
entities = []
|
||||||
|
normalizations = []
|
||||||
|
|
||||||
|
matches = link_regex.findall(text)
|
||||||
|
for match in matches:
|
||||||
|
match = match[2:][:-2].replace("_", " ").strip()
|
||||||
|
|
||||||
|
if ns_regex.match(match):
|
||||||
|
pass # ignore namespaces at the beginning of the string
|
||||||
|
|
||||||
|
# this is a simple link, with the alias the same as the mention
|
||||||
|
elif "|" not in match:
|
||||||
|
aliases.append(match)
|
||||||
|
entities.append(match)
|
||||||
|
normalizations.append(True)
|
||||||
|
|
||||||
|
# in wiki format, the link is written as [[entity|alias]]
|
||||||
|
else:
|
||||||
|
splits = match.split("|")
|
||||||
|
entity = splits[0].strip()
|
||||||
|
alias = splits[1].strip()
|
||||||
|
# specific wiki format [[alias (specification)|]]
|
||||||
|
if len(alias) == 0 and "(" in entity:
|
||||||
|
alias = entity.split("(")[0]
|
||||||
|
aliases.append(alias)
|
||||||
|
entities.append(entity)
|
||||||
|
normalizations.append(False)
|
||||||
|
else:
|
||||||
|
aliases.append(alias)
|
||||||
|
entities.append(entity)
|
||||||
|
normalizations.append(False)
|
||||||
|
|
||||||
|
return aliases, entities, normalizations
|
||||||
|
|
||||||
|
|
||||||
|
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
|
||||||
|
alias = alias.strip()
|
||||||
|
entity = entity.strip()
|
||||||
|
|
||||||
|
# remove everything after # as this is not part of the title but refers to a specific paragraph
|
||||||
|
if normalize_entity:
|
||||||
|
# wikipedia titles are always capitalized
|
||||||
|
entity = _capitalize_first(entity.split("#")[0])
|
||||||
|
if normalize_alias:
|
||||||
|
alias = alias.split("#")[0]
|
||||||
|
|
||||||
|
if alias and entity:
|
||||||
|
alias_dict = map_alias_to_link.get(alias, dict())
|
||||||
|
entity_count = alias_dict.get(entity, 0)
|
||||||
|
alias_dict[entity] = entity_count + 1
|
||||||
|
map_alias_to_link[alias] = alias_dict
|
||||||
|
|
||||||
|
|
||||||
|
def _capitalize_first(text):
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
result = text[0].capitalize()
|
||||||
|
if len(result) > 0:
|
||||||
|
result += text[1:]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def write_entity_counts(to_print=False):
|
||||||
|
""" STEP 2: write entity counts """
|
||||||
|
entity_to_count = dict()
|
||||||
|
total_count = 0
|
||||||
|
|
||||||
|
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
||||||
|
# skip header
|
||||||
|
prior_file.readline()
|
||||||
|
line = prior_file.readline()
|
||||||
|
|
||||||
|
while line:
|
||||||
|
splits = line.replace('\n', "").split(sep='|')
|
||||||
|
# alias = splits[0]
|
||||||
|
count = int(splits[1])
|
||||||
|
entity = splits[2]
|
||||||
|
|
||||||
|
current_count = entity_to_count.get(entity, 0)
|
||||||
|
entity_to_count[entity] = current_count + count
|
||||||
|
|
||||||
|
total_count += count
|
||||||
|
|
||||||
|
line = prior_file.readline()
|
||||||
|
|
||||||
|
with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
|
||||||
|
entity_file.write("entity" + "|" + "count" + "\n")
|
||||||
|
for entity, count in entity_to_count.items():
|
||||||
|
entity_file.write(entity + "|" + str(count) + "\n")
|
||||||
|
|
||||||
|
if to_print:
|
||||||
|
for entity, count in entity_to_count.items():
|
||||||
|
print("Entity count:", entity, count)
|
||||||
|
print("Total count:", total_count)
|
||||||
|
|
||||||
|
|
||||||
|
def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True):
|
||||||
|
""" STEP 3: create the knowledge base """
|
||||||
kb = KnowledgeBase(vocab=vocab)
|
kb = KnowledgeBase(vocab=vocab)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
|
@ -52,6 +201,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
|
||||||
# title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
|
# title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
|
||||||
title_to_id = _read_wikidata_entities_json(limit=None)
|
title_to_id = _read_wikidata_entities_json(limit=None)
|
||||||
|
|
||||||
|
# write the title-ID mapping to file
|
||||||
|
if write_entity_defs:
|
||||||
|
with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file:
|
||||||
|
entity_file.write("WP_title" + "|" + "WD_id" + "\n")
|
||||||
|
for title, qid in title_to_id.items():
|
||||||
|
entity_file.write(title + "|" + str(qid) + "\n")
|
||||||
|
|
||||||
title_list = list(title_to_id.keys())
|
title_list = list(title_to_id.keys())
|
||||||
entity_list = [title_to_id[x] for x in title_list]
|
entity_list = [title_to_id[x] for x in title_list]
|
||||||
|
|
||||||
|
@ -94,37 +250,16 @@ def _get_entity_frequencies(entities):
|
||||||
return [entity_to_count.get(e, 0) for e in entities]
|
return [entity_to_count.get(e, 0) for e in entities]
|
||||||
|
|
||||||
|
|
||||||
def _write_entity_counts(to_print=False):
|
def _get_entity_to_id():
|
||||||
entity_to_count = dict()
|
entity_to_id = dict()
|
||||||
total_count = 0
|
with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile:
|
||||||
|
csvreader = csv.reader(csvfile, delimiter='|')
|
||||||
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
|
||||||
# skip header
|
# skip header
|
||||||
prior_file.readline()
|
next(csvreader)
|
||||||
line = prior_file.readline()
|
for row in csvreader:
|
||||||
|
entity_to_id[row[0]] = row[1]
|
||||||
|
|
||||||
while line:
|
return entity_to_id
|
||||||
splits = line.replace('\n', "").split(sep='|')
|
|
||||||
# alias = splits[0]
|
|
||||||
count = int(splits[1])
|
|
||||||
entity = splits[2]
|
|
||||||
|
|
||||||
current_count = entity_to_count.get(entity, 0)
|
|
||||||
entity_to_count[entity] = current_count + count
|
|
||||||
|
|
||||||
total_count += count
|
|
||||||
|
|
||||||
line = prior_file.readline()
|
|
||||||
|
|
||||||
with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
|
|
||||||
entity_file.write("entity" + "|" + "count" + "\n")
|
|
||||||
for entity, count in entity_to_count.items():
|
|
||||||
entity_file.write(entity + "|" + str(count) + "\n")
|
|
||||||
|
|
||||||
if to_print:
|
|
||||||
for entity, count in entity_to_count.items():
|
|
||||||
print("Entity count:", entity, count)
|
|
||||||
print("Total count:", total_count)
|
|
||||||
|
|
||||||
|
|
||||||
def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
|
def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
|
||||||
|
@ -337,85 +472,60 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
|
||||||
return title_to_id
|
return title_to_id
|
||||||
|
|
||||||
|
|
||||||
def _read_wikipedia_prior_probs():
|
def test_kb(kb):
|
||||||
""" Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
|
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
|
||||||
The full file takes about 2h to parse 1100M lines (update printed every 5M lines)
|
nlp = spacy.load('en_core_web_sm')
|
||||||
"""
|
|
||||||
|
|
||||||
# find the links
|
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
|
||||||
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
|
nlp.add_pipe(el_pipe, last=True)
|
||||||
|
|
||||||
# match on interwiki links, e.g. `en:` or `:fr:`
|
candidates = my_kb.get_candidates("Bush")
|
||||||
ns_regex = r":?" + "[a-z][a-z]" + ":"
|
|
||||||
|
|
||||||
# match on Namespace: optionally preceded by a :
|
print("generating candidates for 'Bush' :")
|
||||||
for ns in wiki_namespaces:
|
for c in candidates:
|
||||||
ns_regex += "|" + ":?" + ns + ":"
|
print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
|
||||||
|
print()
|
||||||
|
|
||||||
ns_regex = re.compile(ns_regex, re.IGNORECASE)
|
text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
|
||||||
|
"Douglas reminds us to always bring our towel. " \
|
||||||
|
"The main character in Doug's novel is the man Arthur Dent, " \
|
||||||
|
"but Douglas doesn't write about George Washington or Homer Simpson."
|
||||||
|
doc = nlp(text)
|
||||||
|
|
||||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
for ent in doc.ents:
|
||||||
line = file.readline()
|
print("ent", ent.text, ent.label_, ent.kb_id_)
|
||||||
cnt = 0
|
|
||||||
while line:
|
|
||||||
if cnt % 5000000 == 0:
|
|
||||||
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
|
||||||
clean_line = line.strip().decode("utf-8")
|
|
||||||
|
|
||||||
matches = link_regex.findall(clean_line)
|
|
||||||
for match in matches:
|
|
||||||
match = match[2:][:-2].replace("_", " ").strip()
|
|
||||||
|
|
||||||
if ns_regex.match(match):
|
|
||||||
pass # ignore namespaces at the beginning of the string
|
|
||||||
|
|
||||||
# this is a simple link, with the alias the same as the mention
|
|
||||||
elif "|" not in match:
|
|
||||||
_store_alias(match, match, normalize_alias=True, normalize_entity=True)
|
|
||||||
|
|
||||||
# in wiki format, the link is written as [[entity|alias]]
|
|
||||||
else:
|
|
||||||
splits = match.split("|")
|
|
||||||
entity = splits[0].strip()
|
|
||||||
alias = splits[1].strip()
|
|
||||||
# specific wiki format [[alias (specification)|]]
|
|
||||||
if len(alias) == 0 and "(" in entity:
|
|
||||||
alias = entity.split("(")[0]
|
|
||||||
_store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
|
|
||||||
else:
|
|
||||||
_store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
|
|
||||||
|
|
||||||
line = file.readline()
|
|
||||||
cnt += 1
|
|
||||||
|
|
||||||
# write all aliases and their entities and occurrences to file
|
|
||||||
with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
|
|
||||||
outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
|
|
||||||
for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
|
|
||||||
for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
|
|
||||||
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
|
def add_coref():
|
||||||
alias = alias.strip()
|
""" STEP 5: add coreference resolution to our model """
|
||||||
entity = entity.strip()
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
# nlp = spacy.load('en')
|
||||||
|
|
||||||
# remove everything after # as this is not part of the title but refers to a specific paragraph
|
# TODO: this doesn't work yet
|
||||||
if normalize_entity:
|
# neuralcoref.add_to_pipe(nlp)
|
||||||
# wikipedia titles are always capitalized
|
print("done adding to pipe")
|
||||||
entity = capitalize_first(entity.split("#")[0])
|
|
||||||
if normalize_alias:
|
|
||||||
alias = alias.split("#")[0]
|
|
||||||
|
|
||||||
if alias and entity:
|
doc = nlp(u'My sister has a dog. She loves him.')
|
||||||
alias_dict = map_alias_to_link.get(alias, dict())
|
print("done doc")
|
||||||
entity_count = alias_dict.get(entity, 0)
|
|
||||||
alias_dict[entity] = entity_count + 1
|
print(doc._.has_coref)
|
||||||
map_alias_to_link[alias] = alias_dict
|
print(doc._.coref_clusters)
|
||||||
|
|
||||||
|
|
||||||
def _read_wikipedia():
|
def create_training():
|
||||||
""" Read the XML wikipedia data """
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
wp_to_id = _get_entity_to_id()
|
||||||
|
_read_wikipedia(nlp, wp_to_id, limit=10000)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_wikipedia(nlp, wp_to_id, limit=None):
|
||||||
|
""" Read the XML wikipedia data to parse out training data """
|
||||||
|
|
||||||
|
# regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
|
||||||
|
# regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
|
||||||
|
|
||||||
|
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
|
||||||
|
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
|
||||||
|
|
||||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||||
line = file.readline()
|
line = file.readline()
|
||||||
|
@ -424,19 +534,19 @@ def _read_wikipedia():
|
||||||
article_title = None
|
article_title = None
|
||||||
article_id = None
|
article_id = None
|
||||||
reading_text = False
|
reading_text = False
|
||||||
while line and cnt < 1000000:
|
while line and (not limit or cnt < limit):
|
||||||
clean_line = line.strip().decode("utf-8")
|
clean_line = line.strip().decode("utf-8")
|
||||||
|
|
||||||
# Start reading new page
|
# Start reading new page
|
||||||
if clean_line == "<page>":
|
if clean_line == "<page>":
|
||||||
article_text = ""
|
article_text = ""
|
||||||
article_title = None
|
article_title = None
|
||||||
article_id = 342
|
article_id = None
|
||||||
|
|
||||||
# finished reading this page
|
# finished reading this page
|
||||||
elif clean_line == "</page>":
|
elif clean_line == "</page>":
|
||||||
if article_id:
|
if article_id:
|
||||||
_store_wp_article(article_id, article_title, article_text.strip())
|
_process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip())
|
||||||
|
|
||||||
# start reading text within a page
|
# start reading text within a page
|
||||||
if "<text" in clean_line:
|
if "<text" in clean_line:
|
||||||
|
@ -445,17 +555,17 @@ def _read_wikipedia():
|
||||||
if reading_text:
|
if reading_text:
|
||||||
article_text += " " + clean_line
|
article_text += " " + clean_line
|
||||||
|
|
||||||
# stop reading text within a page
|
# stop reading text within a page (we assume a new page doesn't start on the same line)
|
||||||
if "</text" in clean_line:
|
if "</text" in clean_line:
|
||||||
reading_text = False
|
reading_text = False
|
||||||
|
|
||||||
# read the ID of this article
|
# read the ID of this article
|
||||||
ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
|
ids = id_regex.search(clean_line)
|
||||||
if ids:
|
if ids:
|
||||||
article_id = ids[0]
|
article_id = ids[0]
|
||||||
|
|
||||||
# read the title of this article
|
# read the title of this article
|
||||||
titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
|
titles = title_regex.search(clean_line)
|
||||||
if titles:
|
if titles:
|
||||||
article_title = titles[0].strip()
|
article_title = titles[0].strip()
|
||||||
|
|
||||||
|
@ -463,107 +573,145 @@ def _read_wikipedia():
|
||||||
cnt += 1
|
cnt += 1
|
||||||
|
|
||||||
|
|
||||||
def _store_wp_article(article_id, article_title, article_text):
|
def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text):
|
||||||
pass
|
# remove the text tags
|
||||||
|
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text>)')
|
||||||
|
text = text_regex.search(article_text).group(0)
|
||||||
|
|
||||||
|
# stop processing if this is a redirect page
|
||||||
|
if text.startswith("#REDIRECT"):
|
||||||
|
return
|
||||||
|
|
||||||
print("WP article", article_id, ":", article_title)
|
print("WP article", article_id, ":", article_title)
|
||||||
print(article_text)
|
|
||||||
print(_get_clean_wp_text(article_text))
|
article_dict = dict()
|
||||||
|
aliases, entities, normalizations = _get_wp_links(text)
|
||||||
|
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||||
|
entity_id = wp_to_id.get(entity)
|
||||||
|
if entity_id:
|
||||||
|
# print(" ", alias, '-->', entity, '-->', entity_id)
|
||||||
|
article_dict[alias] = entity_id
|
||||||
|
article_dict[entity] = entity_id
|
||||||
|
|
||||||
|
# get the raw text without markup etc
|
||||||
|
clean_text = _get_clean_wp_text(text)
|
||||||
|
|
||||||
|
#print(text)
|
||||||
|
print(clean_text)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
_run_ner(nlp, article_id, article_title, clean_text, article_dict)
|
||||||
|
|
||||||
|
|
||||||
|
info_regex = re.compile(r'{[^{]*?}')
|
||||||
|
interwiki_regex = re.compile(r'\[\[([^|]*?)]]')
|
||||||
|
interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')
|
||||||
|
htlm_regex = re.compile(r'<!--[^!]*-->')
|
||||||
|
category_regex = re.compile(r'\[\[Category:[^\[]*]]')
|
||||||
|
file_regex = re.compile(r'\[\[File:[^[\]]+]]')
|
||||||
|
ref_regex = re.compile(r'<ref.*?>') # non-greedy
|
||||||
|
ref_2_regex = re.compile(r'</ref.*?>') # non-greedy
|
||||||
|
|
||||||
|
|
||||||
def _get_clean_wp_text(article_text):
|
def _get_clean_wp_text(article_text):
|
||||||
# TODO: compile the regular expressions
|
clean_text = article_text.strip()
|
||||||
|
|
||||||
# remove Category and File statements
|
# remove bolding & italic markup
|
||||||
clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text)
|
clean_text = clean_text.replace('\'\'\'', '')
|
||||||
print("1", clean_text)
|
clean_text = clean_text.replace('\'\'', '')
|
||||||
clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) # TODO: this doesn't work yet
|
|
||||||
print("2", clean_text)
|
|
||||||
|
|
||||||
# remove bolding markup
|
|
||||||
clean_text = re.sub('\'\'\'', '', clean_text)
|
|
||||||
clean_text = re.sub('\'\'', '', clean_text)
|
|
||||||
|
|
||||||
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
|
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
|
||||||
try_again = True
|
try_again = True
|
||||||
previous_length = len(clean_text)
|
previous_length = len(clean_text)
|
||||||
while try_again:
|
while try_again:
|
||||||
clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match excluding a nested {
|
clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested {
|
||||||
if len(clean_text) < previous_length:
|
if len(clean_text) < previous_length:
|
||||||
try_again = True
|
try_again = True
|
||||||
else:
|
else:
|
||||||
try_again = False
|
try_again = False
|
||||||
previous_length = len(clean_text)
|
previous_length = len(clean_text)
|
||||||
|
|
||||||
# remove multiple spaces
|
|
||||||
while ' ' in clean_text:
|
|
||||||
clean_text = re.sub(' ', ' ', clean_text)
|
|
||||||
|
|
||||||
# remove simple interwiki links (no alternative name)
|
# remove simple interwiki links (no alternative name)
|
||||||
clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text)
|
clean_text = interwiki_regex.sub(r'\1', clean_text)
|
||||||
|
|
||||||
# remove simple interwiki links by picking the alternative name
|
# remove simple interwiki links by picking the alternative name
|
||||||
clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text)
|
clean_text = interwiki_2_regex.sub(r'\1', clean_text)
|
||||||
|
|
||||||
# remove HTML comments
|
# remove HTML comments
|
||||||
clean_text = re.sub('<!--[^!]*-->', '', clean_text)
|
clean_text = htlm_regex.sub('', clean_text)
|
||||||
|
|
||||||
return clean_text
|
# remove Category and File statements
|
||||||
|
clean_text = category_regex.sub('', clean_text)
|
||||||
|
clean_text = file_regex.sub('', clean_text)
|
||||||
|
|
||||||
|
# remove multiple =
|
||||||
|
while '==' in clean_text:
|
||||||
|
clean_text = clean_text.replace("==", "=")
|
||||||
|
|
||||||
|
clean_text = clean_text.replace(". =", ".")
|
||||||
|
clean_text = clean_text.replace(" = ", ". ")
|
||||||
|
clean_text = clean_text.replace("= ", ".")
|
||||||
|
clean_text = clean_text.replace(" =", "")
|
||||||
|
|
||||||
|
# remove refs (non-greedy match)
|
||||||
|
clean_text = ref_regex.sub('', clean_text)
|
||||||
|
clean_text = ref_2_regex.sub('', clean_text)
|
||||||
|
|
||||||
|
# remove additional wikiformatting
|
||||||
|
clean_text = re.sub(r'<blockquote>', '', clean_text)
|
||||||
|
clean_text = re.sub(r'</blockquote>', '', clean_text)
|
||||||
|
|
||||||
|
# change special characters back to normal ones
|
||||||
|
clean_text = clean_text.replace(r'<', '<')
|
||||||
|
clean_text = clean_text.replace(r'>', '>')
|
||||||
|
clean_text = clean_text.replace(r'"', '"')
|
||||||
|
clean_text = clean_text.replace(r'&nbsp;', ' ')
|
||||||
|
clean_text = clean_text.replace(r'&', '&')
|
||||||
|
|
||||||
|
# remove multiple spaces
|
||||||
|
while ' ' in clean_text:
|
||||||
|
clean_text = clean_text.replace(' ', ' ')
|
||||||
|
|
||||||
|
return clean_text.strip()
|
||||||
|
|
||||||
|
|
||||||
def add_el(kb, nlp):
|
def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
|
||||||
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
|
pass # TODO
|
||||||
nlp.add_pipe(el_pipe, last=True)
|
|
||||||
|
|
||||||
text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
|
|
||||||
"Douglas reminds us to always bring our towel. " \
|
|
||||||
"The main character in Doug's novel is the man Arthur Dent, " \
|
|
||||||
"but Douglas doesn't write about George Washington or Homer Simpson."
|
|
||||||
doc = nlp(text)
|
|
||||||
|
|
||||||
print()
|
|
||||||
for token in doc:
|
|
||||||
print("token", token.text, token.ent_type_, token.ent_kb_id_)
|
|
||||||
|
|
||||||
print()
|
|
||||||
for ent in doc.ents:
|
|
||||||
print("ent", ent.text, ent.label_, ent.kb_id_)
|
|
||||||
|
|
||||||
|
|
||||||
def capitalize_first(text):
|
|
||||||
if not text:
|
|
||||||
return None
|
|
||||||
result = text[0].capitalize()
|
|
||||||
if len(result) > 0:
|
|
||||||
result += text[1:]
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("START", datetime.datetime.now())
|
print("START", datetime.datetime.now())
|
||||||
|
print()
|
||||||
|
my_kb = None
|
||||||
|
|
||||||
|
# one-time methods to create KB and write to file
|
||||||
to_create_prior_probs = False
|
to_create_prior_probs = False
|
||||||
to_create_entity_counts = False
|
to_create_entity_counts = False
|
||||||
to_create_kb = False
|
to_create_kb = False
|
||||||
to_read_kb = True
|
|
||||||
|
# read KB back in from file
|
||||||
|
to_read_kb = False
|
||||||
|
to_test_kb = False
|
||||||
|
|
||||||
|
create_wp_training = True
|
||||||
|
|
||||||
# STEP 1 : create prior probabilities from WP
|
# STEP 1 : create prior probabilities from WP
|
||||||
# run only once !
|
# run only once !
|
||||||
if to_create_prior_probs:
|
if to_create_prior_probs:
|
||||||
print("STEP 1: to_create_prior_probs", datetime.datetime.now())
|
print("STEP 1: to_create_prior_probs", datetime.datetime.now())
|
||||||
_read_wikipedia_prior_probs()
|
read_wikipedia_prior_probs()
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# STEP 2 : deduce entity frequencies from WP
|
# STEP 2 : deduce entity frequencies from WP
|
||||||
# run only once !
|
# run only once !
|
||||||
if to_create_entity_counts:
|
if to_create_entity_counts:
|
||||||
print("STEP 2: to_create_entity_counts", datetime.datetime.now())
|
print("STEP 2: to_create_entity_counts", datetime.datetime.now())
|
||||||
_write_entity_counts()
|
write_entity_counts()
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
# STEP 3 : create KB and write to file
|
||||||
|
# run only once !
|
||||||
if to_create_kb:
|
if to_create_kb:
|
||||||
# STEP 3 : create KB
|
print("STEP 3a: to_create_kb", datetime.datetime.now())
|
||||||
print("STEP 3: to_create_kb", datetime.datetime.now())
|
|
||||||
my_nlp = spacy.load('en_core_web_sm')
|
my_nlp = spacy.load('en_core_web_sm')
|
||||||
my_vocab = my_nlp.vocab
|
my_vocab = my_nlp.vocab
|
||||||
my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
|
my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
|
||||||
|
@ -571,15 +719,14 @@ if __name__ == "__main__":
|
||||||
print("kb aliases:", my_kb.get_size_aliases())
|
print("kb aliases:", my_kb.get_size_aliases())
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# STEP 4 : write KB to file
|
print("STEP 3b: write KB", datetime.datetime.now())
|
||||||
print("STEP 4: write KB", datetime.datetime.now())
|
|
||||||
my_kb.dump(KB_FILE)
|
my_kb.dump(KB_FILE)
|
||||||
my_vocab.to_disk(VOCAB_DIR)
|
my_vocab.to_disk(VOCAB_DIR)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
# STEP 4 : read KB back in from file
|
||||||
if to_read_kb:
|
if to_read_kb:
|
||||||
# STEP 5 : read KB back in from file
|
print("STEP 4: to_read_kb", datetime.datetime.now())
|
||||||
print("STEP 5: to_read_kb", datetime.datetime.now())
|
|
||||||
my_vocab = Vocab()
|
my_vocab = Vocab()
|
||||||
my_vocab.from_disk(VOCAB_DIR)
|
my_vocab.from_disk(VOCAB_DIR)
|
||||||
my_kb = KnowledgeBase(vocab=my_vocab)
|
my_kb = KnowledgeBase(vocab=my_vocab)
|
||||||
|
@ -589,16 +736,17 @@ if __name__ == "__main__":
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# test KB
|
# test KB
|
||||||
candidates = my_kb.get_candidates("Bush")
|
if to_test_kb:
|
||||||
for c in candidates:
|
test_kb(my_kb)
|
||||||
print("entity:", c.entity_)
|
|
||||||
print("entity freq:", c.entity_freq)
|
|
||||||
print("alias:", c.alias_)
|
|
||||||
print("prior prob:", c.prior_prob)
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# STEP 6: add KB to NLP pipeline
|
# STEP 5: create a training dataset from WP
|
||||||
# print("STEP 6: use KB", datetime.datetime.now())
|
if create_wp_training:
|
||||||
# add_el(my_kb, nlp)
|
print("STEP 5: create training dataset", datetime.datetime.now())
|
||||||
|
create_training()
|
||||||
|
|
||||||
|
# TODO coreference resolution
|
||||||
|
# add_coref()
|
||||||
|
|
||||||
|
print()
|
||||||
print("STOP", datetime.datetime.now())
|
print("STOP", datetime.datetime.now())
|
||||||
|
|
Loading…
Reference in New Issue
Block a user