mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00
deduce entity freq from WP corpus and serialize vocab in WP test
This commit is contained in:
parent 387263d618
commit 19e8f339cb
First changed file (the WikiData/Wikipedia entity linking example script):

@@ -1,7 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
+from spacy.vocab import Vocab
+
+"""
+Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
 """
 import re
 import json
@@ -17,6 +20,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
 
 # these will/should be matched ignoring case
@@ -40,12 +44,16 @@ map_alias_to_link = dict()
 def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     kb = KnowledgeBase(vocab=vocab)
 
-    id_to_title = _read_wikidata(limit=1000)
-    title_to_id = {v:k for k,v in id_to_title.items()}
+    id_to_title = _read_wikidata_entities(limit=None)
+    title_to_id = {v: k for k, v in id_to_title.items()}
 
+    entity_list = list(id_to_title.keys())
+    title_list = [id_to_title[x] for x in entity_list]
+    entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False)
+
     _add_entities(kb,
-                  entities=id_to_title.keys(),
-                  probs=[0.4 for x in id_to_title.keys()],
+                  entities=entity_list,
+                  probs=entity_frequencies,
                   to_print=to_print)
 
     _add_aliases(kb,
@@ -64,6 +72,38 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     return kb
 
 
+def _get_entity_frequencies(entities, to_print=False):
+    count_entities = [0 for _ in entities]
+    total_count = 0
+
+    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
+        # skip header
+        prior_file.readline()
+        line = prior_file.readline()
+        # we can read this file sequentially, it's sorted by alias, and then by count
+
+        while line:
+            splits = line.replace('\n', "").split(sep='|')
+            # alias = splits[0]
+            count = int(splits[1])
+            entity = splits[2]
+
+            if entity in entities:
+                index = entities.index(entity)
+                count_entities[index] = count_entities[index] + count
+
+            total_count += count
+
+            line = prior_file.readline()
+
+    if to_print:
+        for entity, count in zip(entities, count_entities):
+            print("Entity count:", entity, count)
+        print("Total count:", total_count)
+
+    return [x*100 / total_count for x in count_entities]
+
+
 def _add_entities(kb, entities, probs, to_print=False):
     for entity, prob in zip(entities, probs):
         kb.add_entity(entity=entity, prob=prob)
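The new _get_entity_frequencies helper reads the pipe-separated rows of prior_prob.csv (alias|count|entity), sums the counts per Wikipedia title, and normalises them against the total count of the file. A minimal sketch of that normalisation on hypothetical toy counts (the names and numbers below are made up; only the formula comes from the diff):

    # Sketch, not part of the diff: how the frequency normalisation behaves on toy data.
    entities = ["Douglas Adams", "Arthur Dent"]   # hypothetical entity titles
    count_entities = [30, 10]                     # counts summed over all alias rows naming each entity
    total_count = 50                              # total over the whole file, including other entities
    frequencies = [x * 100 / total_count for x in count_entities]
    print(frequencies)                            # [60.0, 20.0] -- these become the probs passed to kb.add_entity()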
@@ -76,7 +116,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals
     wp_titles = title_to_id.keys()
 
     if to_print:
-        print("wp titles", wp_titles)
+        print("wp titles:", wp_titles)
 
     # adding aliases with prior probabilities
     with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
@@ -125,89 +165,100 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
 
 
-def _read_wikidata(limit=None, to_print=False):
-    """ Read the JSON wiki data """
+def _read_wikidata_entities(limit=None, to_print=False):
+    """ Read the JSON wiki data and parse out the entities"""
 
     languages = {'en', 'de'}
     prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
-    sites = {'enwiki'}
+    site_filter = 'enwiki'
 
     entity_dict = dict()
 
+    # parse appropriate fields - depending on what we need in the KB
+    parse_properties = False
+    parse_sitelinks = True
+    parse_labels = False
+    parse_descriptions = False
+    parse_aliases = False
+
     with bz2.open(WIKIDATA_JSON, mode='rb') as file:
         line = file.readline()
-        cnt = 1
+        cnt = 0
         while line and (not limit or cnt < limit):
+            if cnt % 100000 == 0:
+                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
             clean_line = line.strip()
             if clean_line.endswith(b","):
                 clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                 obj = json.loads(clean_line)
-                keep = False
+                unique_id = obj["id"]
+                entry_type = obj["type"]
 
-                # filtering records on their properties
-                # TODO: filter on rank: preferred, normal or deprecated
-                claims = obj["claims"]
-                for prop, value_set in prop_filter.items():
-                    claim_property = claims.get(prop, None)
-                    if claim_property:
-                        for cp in claim_property:
-                            cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
-                            if cp_id in value_set:
-                                keep = True
+                if unique_id[0] == 'Q' and entry_type == "item":
+                    # filtering records on their properties
+                    keep = False
+                    claims = obj["claims"]
+                    for prop, value_set in prop_filter.items():
+                        claim_property = claims.get(prop, None)
+                        if claim_property:
+                            for cp in claim_property:
+                                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
+                                cp_rank = cp['rank']
+                                if cp_rank != "deprecated" and cp_id in value_set:
+                                    keep = True
 
-                if keep:
-                    unique_id = obj["id"]
-                    entry_type = obj["type"]
+                    if keep:
+                        if to_print:
+                            print("ID:", unique_id)
+                            print("type:", entry_type)
 
-                    if to_print:
-                        print("ID:", unique_id)
-                        print("type:", entry_type)
+                        # parsing all properties that refer to other entities
+                        if parse_properties:
+                            for prop, claim_property in claims.items():
+                                cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
+                                cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
+                                if cp_values:
+                                    if to_print:
+                                        print("prop:", prop, cp_values)
 
-                    # parsing all properties that refer to other entities
-                    for prop, claim_property in claims.items():
-                        cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
-                        cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
-                        if cp_values:
-                            if to_print:
-                                print("prop:", prop, cp_values)
-
-                    entry_sites = obj["sitelinks"]
-                    for site in sites:
-                        site_value = entry_sites.get(site, None)
-                        if site_value:
-                            if to_print:
-                                print(site, ":", site_value['title'])
-                            if site == "enwiki":
+                        if parse_sitelinks:
+                            site_value = obj["sitelinks"].get(site_filter, None)
+                            if site_value:
+                                if to_print:
+                                    print(site_filter, ":", site_value['title'])
                                 entity_dict[unique_id] = site_value['title']
 
-                    labels = obj["labels"]
-                    if labels:
-                        for lang in languages:
-                            lang_label = labels.get(lang, None)
-                            if lang_label:
-                                if to_print:
-                                    print("label (" + lang + "):", lang_label["value"])
+                        if parse_labels:
+                            labels = obj["labels"]
+                            if labels:
+                                for lang in languages:
+                                    lang_label = labels.get(lang, None)
+                                    if lang_label:
+                                        if to_print:
+                                            print("label (" + lang + "):", lang_label["value"])
 
-                    descriptions = obj["descriptions"]
-                    if descriptions:
-                        for lang in languages:
-                            lang_descr = descriptions.get(lang, None)
-                            if lang_descr:
-                                if to_print:
-                                    print("description (" + lang + "):", lang_descr["value"])
+                        if parse_descriptions:
+                            descriptions = obj["descriptions"]
+                            if descriptions:
+                                for lang in languages:
+                                    lang_descr = descriptions.get(lang, None)
+                                    if lang_descr:
+                                        if to_print:
+                                            print("description (" + lang + "):", lang_descr["value"])
 
-                    aliases = obj["aliases"]
-                    if aliases:
-                        for lang in languages:
-                            lang_aliases = aliases.get(lang, None)
-                            if lang_aliases:
-                                for item in lang_aliases:
-                                    if to_print:
-                                        print("alias (" + lang + "):", item["value"])
+                        if parse_aliases:
+                            aliases = obj["aliases"]
+                            if aliases:
+                                for lang in languages:
+                                    lang_aliases = aliases.get(lang, None)
+                                    if lang_aliases:
+                                        for item in lang_aliases:
+                                            if to_print:
+                                                print("alias (" + lang + "):", item["value"])
 
-                    if to_print:
-                        print()
+                        if to_print:
+                            print()
             line = file.readline()
             cnt += 1
 
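The restructured _read_wikidata_entities keeps only records whose id starts with 'Q' and whose type is "item", requires a non-deprecated P31 claim matching the filter (Q5 or Q15632617), and, with parse_sitelinks enabled, stores the enwiki title per id. A sketch of the same filter applied to one hypothetical, already-parsed record (the record is made up; the filtering logic mirrors the diff):

    # Sketch, not part of the diff: the P31/rank/sitelink filter on a single toy record.
    obj = {
        "id": "Q42",
        "type": "item",
        "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}, "rank": "normal"}]},
        "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
    }
    prop_filter = {'P31': {'Q5', 'Q15632617'}}

    keep = False
    if obj["id"][0] == 'Q' and obj["type"] == "item":
        for prop, value_set in prop_filter.items():
            for cp in obj["claims"].get(prop, []):
                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
                if cp['rank'] != "deprecated" and cp_id in value_set:
                    keep = True

    if keep:
        site_value = obj["sitelinks"].get('enwiki', None)
        if site_value:
            print(obj["id"], "->", site_value['title'])   # Q42 -> Douglas Adams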
@@ -236,7 +287,7 @@ def _read_wikipedia_prior_probs():
         cnt = 0
         while line:
             if cnt % 5000000 == 0:
-                print(datetime.datetime.now(), "processed", cnt, "lines")
+                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
             clean_line = line.strip().decode("utf-8")
 
             matches = link_regex.findall(clean_line)
@@ -394,7 +445,8 @@ def add_el(kb, nlp):
 
     text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
            "Douglas reminds us to always bring our towel. " \
-           "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington."
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
     doc = nlp(text)
 
     print()
@@ -414,48 +466,46 @@ def capitalize_first(text):
     result += text[1:]
     return result
 
 
 if __name__ == "__main__":
+    to_create_prior_probs = False
+    to_create_kb = True
+    to_read_kb = False
+
     # STEP 1 : create prior probabilities from WP
     # run only once !
-    # _read_wikipedia_prior_probs()
+    if to_create_prior_probs:
+        _read_wikipedia_prior_probs()
 
-    # STEP 2 : create KB
-    # nlp = spacy.load('en_core_web_sm')
-    # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
+    if to_create_kb:
+        # STEP 2 : create KB
+        my_nlp = spacy.load('en_core_web_sm')
+        my_vocab = my_nlp.vocab
+        my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
+        print("kb entities:", my_kb.get_size_entities())
+        print("kb aliases:", my_kb.get_size_aliases())
 
-    # STEP 3 : write KB to file
-    nlp1 = spacy.load('en_core_web_sm')
-    my_vocab = nlp1.vocab
-    kb1 = KnowledgeBase(vocab=my_vocab)
-    kb1.add_entity(entity="Q53", prob=0.33)
-    kb1.add_entity(entity="Q17", prob=0.1)
-    kb1.add_entity(entity="Q007", prob=0.7)
-    kb1.add_entity(entity="Q44", prob=0.4)
-    kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1])
-    kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
-    kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
+        # STEP 3 : write KB to file
+        my_kb.dump(KB_FILE)
+        my_vocab.to_disk(VOCAB_DIR)
+
+    if to_read_kb:
+        # STEP 4 : read KB back in from file
+        my_vocab = Vocab()
+        my_vocab.from_disk(VOCAB_DIR)
+        my_kb = KnowledgeBase(vocab=my_vocab)
+        my_kb.load_bulk(KB_FILE)
+        print("kb entities:", my_kb.get_size_entities())
+        print("kb aliases:", my_kb.get_size_aliases())
 
-    print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases())
-    print("kb1 entities:", kb1.get_entity_strings())
-    print("kb1 aliases:", kb1.get_alias_strings())
+        # test KB
+        candidates = my_kb.get_candidates("Bush")
+        for c in candidates:
+            print()
+            print("entity:", c.entity_)
+            print("entity freq:", c.entity_freq)
+            print("alias:", c.alias_)
+            print("prior prob:", c.prior_prob)
 
-    print()
-    print("dumping kb1")
-    print(KB_FILE, type(KB_FILE))
-    kb1.dump(KB_FILE)
-
-    # STEP 4 : read KB back in from file
-
-    kb3 = KnowledgeBase(vocab=my_vocab)
-
-    print("loading kb3")
-    kb3.load_bulk(KB_FILE)
-
-    print()
-    print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
-    print("kb3 entities:", kb3.get_entity_strings())
-    print("kb3 aliases:", kb3.get_alias_strings())
-
-    # STEP 5 : actually use the EL functionality
+    # STEP 5: add KB to NLP pipeline
     # add_el(my_kb, nlp)
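Taken together, the new __main__ block splits the script into switchable steps: build the KB from a loaded pipeline's vocab, dump both the KB and the vocab, then later reload them into a fresh Vocab before querying candidates. A condensed sketch of that round trip using the same calls the diff exercises (kb.dump, Vocab.to_disk/from_disk, load_bulk, get_candidates); the 'kb' and 'vocab' paths below are placeholders standing in for KB_FILE and VOCAB_DIR:

    # Sketch, not part of the diff: the dump/load round trip in one place.
    import spacy
    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    nlp = spacy.load('en_core_web_sm')
    kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=False)  # create_kb is defined in this script

    kb.dump('kb')                  # entities, aliases and prior probabilities
    nlp.vocab.to_disk('vocab')     # the vocab the KB's strings depend on

    vocab2 = Vocab()
    vocab2.from_disk('vocab')      # reload into a fresh Vocab
    kb2 = KnowledgeBase(vocab=vocab2)
    kb2.load_bulk('kb')

    for c in kb2.get_candidates("Bush"):
        print(c.entity_, c.entity_freq, c.alias_, c.prior_prob)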
Second changed file (the knowledge-base serialization test):

@@ -1,3 +1,5 @@
+import spacy
+from spacy.lang.en import English
 from ..util import make_tempdir
 from ...util import ensure_path
 
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_kb_disk(en_vocab):
|
def test_serialize_kb_disk(en_vocab):
|
||||||
kb1 = KnowledgeBase(vocab=en_vocab)
|
|
||||||
|
|
||||||
kb1.add_entity(entity="Q53", prob=0.33)
|
|
||||||
kb1.add_entity(entity="Q17", prob=0.2)
|
|
||||||
kb1.add_entity(entity="Q007", prob=0.7)
|
|
||||||
kb1.add_entity(entity="Q44", prob=0.4)
|
|
||||||
kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
|
|
||||||
kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
|
|
||||||
kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
|
|
||||||
|
|
||||||
# baseline assertions
|
# baseline assertions
|
||||||
|
kb1 = _get_dummy_kb(en_vocab)
|
||||||
_check_kb(kb1)
|
_check_kb(kb1)
|
||||||
|
|
||||||
# dumping to file & loading back in
|
# dumping to file & loading back in
|
||||||
|
@@ -34,6 +27,20 @@ def test_serialize_kb_disk(en_vocab):
     _check_kb(kb2)
 
 
+def _get_dummy_kb(vocab):
+    kb = KnowledgeBase(vocab=vocab)
+
+    kb.add_entity(entity="Q53", prob=0.33)
+    kb.add_entity(entity="Q17", prob=0.2)
+    kb.add_entity(entity="Q007", prob=0.7)
+    kb.add_entity(entity="Q44", prob=0.4)
+    kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
+    kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
+    kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
+
+    return kb
+
+
 def _check_kb(kb):
     # check entities
     assert kb.get_size_entities() == 4
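The test refactor pulls the KB construction into _get_dummy_kb so that the baseline KB and the reloaded KB can be checked with the same _check_kb assertions. The dump/load middle of test_serialize_kb_disk sits outside the shown hunks, so the tempdir round trip in this sketch is an assumption about how the pieces fit together, not part of the diff:

    # Sketch, not part of the diff: assumed shape of the full serialization test.
    def test_serialize_kb_disk(en_vocab):
        # baseline assertions on a freshly built dummy KB
        kb1 = _get_dummy_kb(en_vocab)
        _check_kb(kb1)

        # assumed middle part: dump to a temp dir and load back in
        with make_tempdir() as d:
            file_path = ensure_path(d) / "kb"
            kb1.dump(str(file_path))

            kb2 = KnowledgeBase(vocab=en_vocab)
            kb2.load_bulk(str(file_path))

        # the same assertions must hold for the reloaded KB
        _check_kb(kb2)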