parsing clean text from WP articles to use as input data for NER and NEL

2025-10-15 08:16:36 +03:00 · 2019-05-02 17:09:56 +02:00 · 2019-05-02 17:09:56 +02:00 · 581dc9742d
commit 581dc9742d
parent 8353552191
1 changed files with 320 additions and 172 deletions
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@ -10,9 +10,13 @@ import json
 import spacy
 import datetime
 import bz2
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab
 # requires: pip install neuralcoref --no-binary neuralcoref
 # import neuralcoref
 # TODO: remove hardcoded paths
 WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
 ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
@ -20,6 +24,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
@ -43,7 +48,151 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
 map_alias_to_link = dict()
-def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
+def read_wikipedia_prior_probs():
    """
    STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
    The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
    It works relatively fast because we don't care about which article we parsed the interwiki from,
    we just process line by line.
    """
    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line:
            if cnt % 5000000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
            clean_line = line.strip().decode("utf-8")
            aliases, entities, normalizations = _get_wp_links(clean_line)
            for alias, entity, norm in zip(aliases, entities, normalizations):
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
            line = file.readline()
            cnt += 1
    # write all aliases and their entities and occurrences to file
    with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
 # find the links
 link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
 # match on interwiki links, e.g. `en:` or `:fr:`
 ns_regex = r":?" + "[a-z][a-z]" + ":"
 # match on Namespace: optionally preceded by a :
 for ns in wiki_namespaces:
    ns_regex += "|" + ":?" + ns + ":"
 ns_regex = re.compile(ns_regex, re.IGNORECASE)
 def _get_wp_links(text):
    aliases = []
    entities = []
    normalizations = []
    matches = link_regex.findall(text)
    for match in matches:
        match = match[2:][:-2].replace("_", " ").strip()
        if ns_regex.match(match):
            pass  # ignore namespaces at the beginning of the string
        # this is a simple link, with the alias the same as the mention
        elif "|" not in match:
            aliases.append(match)
            entities.append(match)
            normalizations.append(True)
        # in wiki format, the link is written as [[entity|alias]]
        else:
            splits = match.split("|")
            entity = splits[0].strip()
            alias = splits[1].strip()
            # specific wiki format  [[alias (specification)|]]
            if len(alias) == 0 and "(" in entity:
                alias = entity.split("(")[0]
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
            else:
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
    return aliases, entities, normalizations
 def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
    alias = alias.strip()
    entity = entity.strip()
    # remove everything after # as this is not part of the title but refers to a specific paragraph
    if normalize_entity:
        # wikipedia titles are always capitalized
        entity = _capitalize_first(entity.split("#")[0])
    if normalize_alias:
        alias = alias.split("#")[0]
    if alias and entity:
        alias_dict = map_alias_to_link.get(alias, dict())
        entity_count = alias_dict.get(entity, 0)
        alias_dict[entity] = entity_count + 1
        map_alias_to_link[alias] = alias_dict
 def _capitalize_first(text):
    if not text:
        return None
    result = text[0].capitalize()
    if len(result) > 0:
        result += text[1:]
    return result
 def write_entity_counts(to_print=False):
    """ STEP 2: write entity counts  """
    entity_to_count = dict()
    total_count = 0
    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        while line:
            splits = line.replace('\n', "").split(sep='|')
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]
            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count
            total_count += count
            line = prior_file.readline()
    with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")
    if to_print:
        for entity, count in entity_to_count.items():
            print("Entity count:", entity, count)
        print("Total count:", total_count)
 def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True):
    """ STEP 3: create the knowledge base """
    kb = KnowledgeBase(vocab=vocab)
    print()
@ -52,6 +201,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
    # title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
    title_to_id = _read_wikidata_entities_json(limit=None)
    # write the title-ID mapping to file
    if write_entity_defs:
        with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file:
            entity_file.write("WP_title" + "|" + "WD_id" + "\n")
            for title, qid in title_to_id.items():
                entity_file.write(title + "|" + str(qid) + "\n")
    title_list = list(title_to_id.keys())
    entity_list = [title_to_id[x] for x in title_list]
@ -94,37 +250,16 @@ def _get_entity_frequencies(entities):
    return [entity_to_count.get(e, 0) for e in entities]
-def _write_entity_counts(to_print=False):
+def _get_entity_to_id():
-    entity_to_count = dict()
+    entity_to_id = dict()
-    total_count = 0
+    with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile:
-
+        csvreader = csv.reader(csvfile, delimiter='|')
    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
        # skip header
-        prior_file.readline()
+        next(csvreader)
-        line = prior_file.readline()
+        for row in csvreader:
            entity_to_id[row[0]] = row[1]
-        while line:
+    return entity_to_id
            splits = line.replace('\n', "").split(sep='|')
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]
            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count
            total_count += count
            line = prior_file.readline()
    with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")
    if to_print:
        for entity, count in entity_to_count.items():
            print("Entity count:", entity, count)
        print("Total count:", total_count)
 def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
@ -337,85 +472,60 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
    return title_to_id
-def _read_wikipedia_prior_probs():
+def test_kb(kb):
-    """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
+    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    The full file takes about 2h to parse 1100M lines (update printed every 5M lines)
+    nlp = spacy.load('en_core_web_sm')
     """
-    # find the links
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
+    nlp.add_pipe(el_pipe, last=True)
-    # match on interwiki links, e.g. `en:` or `:fr:`
+    candidates = my_kb.get_candidates("Bush")
    ns_regex = r":?" + "[a-z][a-z]" + ":"
-    # match on Namespace: optionally preceded by a :
+    print("generating candidates for 'Bush' :")
-    for ns in wiki_namespaces:
+    for c in candidates:
-        ns_regex += "|" + ":?" + ns + ":"
+        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()
-    ns_regex = re.compile(ns_regex, re.IGNORECASE)
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)
-    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
+    for ent in doc.ents:
-        line = file.readline()
+        print("ent", ent.text, ent.label_, ent.kb_id_)
        cnt = 0
        while line:
            if cnt % 5000000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
            clean_line = line.strip().decode("utf-8")
            matches = link_regex.findall(clean_line)
            for match in matches:
                match = match[2:][:-2].replace("_", " ").strip()
                if ns_regex.match(match):
                    pass  # ignore namespaces at the beginning of the string
                # this is a simple link, with the alias the same as the mention
                elif "|" not in match:
                    _store_alias(match, match, normalize_alias=True, normalize_entity=True)
                # in wiki format, the link is written as [[entity|alias]]
                else:
                    splits = match.split("|")
                    entity = splits[0].strip()
                    alias = splits[1].strip()
                    # specific wiki format  [[alias (specification)|]]
                    if len(alias) == 0 and "(" in entity:
                        alias = entity.split("(")[0]
                        _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
                    else:
                        _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
            line = file.readline()
            cnt += 1
    # write all aliases and their entities and occurrences to file
    with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
+def add_coref():
-    alias = alias.strip()
+    """ STEP 5: add coreference resolution to our model """
-    entity = entity.strip()
+    nlp = spacy.load('en_core_web_sm')
    # nlp = spacy.load('en')
-    # remove everything after # as this is not part of the title but refers to a specific paragraph
+    # TODO: this doesn't work yet
-    if normalize_entity:
+    # neuralcoref.add_to_pipe(nlp)
-        # wikipedia titles are always capitalized
+    print("done adding to pipe")
        entity = capitalize_first(entity.split("#")[0])
    if normalize_alias:
        alias = alias.split("#")[0]
-    if alias and entity:
+    doc = nlp(u'My sister has a dog. She loves him.')
-        alias_dict = map_alias_to_link.get(alias, dict())
+    print("done doc")
-        entity_count = alias_dict.get(entity, 0)
+
-        alias_dict[entity] = entity_count + 1
+    print(doc._.has_coref)
-        map_alias_to_link[alias] = alias_dict
+    print(doc._.coref_clusters)
-def _read_wikipedia():
+def create_training():
-    """ Read the XML wikipedia data """
+    nlp = spacy.load('en_core_web_sm')
    wp_to_id = _get_entity_to_id()
    _read_wikipedia(nlp, wp_to_id, limit=10000)
 def _read_wikipedia(nlp, wp_to_id, limit=None):
    """ Read the XML wikipedia data to parse out training data """
    # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
    # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
        line = file.readline()
@ -424,19 +534,19 @@ def _read_wikipedia():
        article_title = None
        article_id = None
        reading_text = False
-        while line and cnt < 1000000:
+        while line and (not limit or cnt < limit):
            clean_line = line.strip().decode("utf-8")
            # Start reading new page
            if clean_line == "<page>":
                article_text = ""
                article_title = None
-                article_id = 342
+                article_id = None
            # finished reading this page
            elif clean_line == "</page>":
                if article_id:
-                    _store_wp_article(article_id, article_title, article_text.strip())
+                    _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip())
            # start reading text within a page
            if "<text" in clean_line:
@ -445,17 +555,17 @@ def _read_wikipedia():
            if reading_text:
                article_text += " " + clean_line
-            # stop reading text within a page
+            # stop reading text within a page (we assume a new page doesn't start on the same line)
            if "</text" in clean_line:
                reading_text = False
            # read the ID of this article
-            ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
+            ids = id_regex.search(clean_line)
            if ids:
                article_id = ids[0]
            # read the title of this article
-            titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
+            titles = title_regex.search(clean_line)
            if titles:
                article_title = titles[0].strip()
@ -463,107 +573,145 @@ def _read_wikipedia():
            cnt += 1
-def _store_wp_article(article_id, article_title, article_text):
+def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text):
-    pass
+    # remove the text tags
    text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text>)')
    text = text_regex.search(article_text).group(0)
    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return
    print("WP article", article_id, ":", article_title)
-    print(article_text)
+
-    print(_get_clean_wp_text(article_text))
+    article_dict = dict()
    aliases, entities, normalizations = _get_wp_links(text)
    for alias, entity, norm in zip(aliases, entities, normalizations):
        entity_id = wp_to_id.get(entity)
        if entity_id:
            # print(" ", alias, '-->', entity, '-->', entity_id)
            article_dict[alias] = entity_id
            article_dict[entity] = entity_id
    # get the raw text without markup etc
    clean_text = _get_clean_wp_text(text)
    #print(text)
    print(clean_text)
    print()
    _run_ner(nlp, article_id, article_title, clean_text, article_dict)
 info_regex = re.compile(r'{[^{]*?}')
 interwiki_regex = re.compile(r'\[\[([^|]*?)]]')
 interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')
 htlm_regex = re.compile(r'&lt;!--[^!]*--&gt;')
 category_regex = re.compile(r'\[\[Category:[^\[]*]]')
 file_regex = re.compile(r'\[\[File:[^[\]]+]]')
 ref_regex = re.compile(r'&lt;ref.*?&gt;')     # non-greedy
 ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy
 def _get_clean_wp_text(article_text):
-    # TODO: compile the regular expressions
+    clean_text = article_text.strip()
-    # remove Category and File statements
+    # remove bolding & italic markup
-    clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text)
+    clean_text = clean_text.replace('\'\'\'', '')
-    print("1", clean_text)
+    clean_text = clean_text.replace('\'\'', '')
    clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text)       # TODO: this doesn't work yet
    print("2", clean_text)
    # remove bolding markup
    clean_text = re.sub('\'\'\'', '', clean_text)
    clean_text = re.sub('\'\'', '', clean_text)
    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
-        clean_text = re.sub('{[^{]*?}', '', clean_text)  # non-greedy match excluding a nested {
+        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)
    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = re.sub('  ', ' ', clean_text)
    # remove simple interwiki links (no alternative name)
-    clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text)
+    clean_text = interwiki_regex.sub(r'\1', clean_text)
    # remove simple interwiki links by picking the alternative name
-    clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text)
+    clean_text = interwiki_2_regex.sub(r'\1', clean_text)
    # remove HTML comments
-    clean_text = re.sub('&lt;!--[^!]*--&gt;', '', clean_text)
+    clean_text = htlm_regex.sub('', clean_text)
-    return clean_text
+    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)
    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")
    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")
    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)
    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)
    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')
    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')
    return clean_text.strip()
-def add_el(kb, nlp):
+def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    pass # TODO
    nlp.add_pipe(el_pipe, last=True)
    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)
    print()
    for token in doc:
        print("token", token.text, token.ent_type_, token.ent_kb_id_)
    print()
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)
 def capitalize_first(text):
    if not text:
        return None
    result = text[0].capitalize()
    if len(result) > 0:
        result += text[1:]
    return result
 if __name__ == "__main__":
    print("START", datetime.datetime.now())
    print()
    my_kb = None
    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False
-    to_read_kb = True
+
    # read KB back in from file
    to_read_kb = False
    to_test_kb = False
    create_wp_training = True
    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
-        _read_wikipedia_prior_probs()
+        read_wikipedia_prior_probs()
        print()
    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
-        _write_entity_counts()
+        write_entity_counts()
        print()
    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
-        # STEP 3 : create KB
+        print("STEP 3a: to_create_kb", datetime.datetime.now())
        print("STEP 3: to_create_kb", datetime.datetime.now())
        my_nlp = spacy.load('en_core_web_sm')
        my_vocab = my_nlp.vocab
        my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
@ -571,15 +719,14 @@ if __name__ == "__main__":
        print("kb aliases:", my_kb.get_size_aliases())
        print()
-        # STEP 4 : write KB to file
+        print("STEP 3b: write KB", datetime.datetime.now())
        print("STEP 4: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        my_vocab.to_disk(VOCAB_DIR)
        print()
    # STEP 4 : read KB back in from file
    if to_read_kb:
-        # STEP 5 : read KB back in from file
+        print("STEP 4: to_read_kb", datetime.datetime.now())
        print("STEP 5: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab)
@ -589,16 +736,17 @@ if __name__ == "__main__":
        print()
        # test KB
-        candidates = my_kb.get_candidates("Bush")
+        if to_test_kb:
-        for c in candidates:
+            test_kb(my_kb)
            print("entity:", c.entity_)
            print("entity freq:", c.entity_freq)
            print("alias:", c.alias_)
            print("prior prob:", c.prior_prob)
            print()
-    # STEP 6: add KB to NLP pipeline
+    # STEP 5: create a training dataset from WP
-    # print("STEP 6: use KB", datetime.datetime.now())
+    if create_wp_training:
-    # add_el(my_kb, nlp)
+        print("STEP 5: create training dataset", datetime.datetime.now())
        create_training()
    # TODO coreference resolution
    # add_coref()
    print()
    print("STOP", datetime.datetime.now())