reading types, claims and sitelinks

2025-08-10 07:04:53 +03:00 · 2019-04-11 21:42:44 +02:00 · 2019-04-11 21:42:44 +02:00 · b31a390a9a
commit b31a390a9a
parent 6e997be4b4
1 changed file with 20 additions and 1 deletion
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -30,6 +30,8 @@ def _read_wikidata():
    # TODO remove hardcoded path

    languages = {'en', 'de'}
+    properties = {'P31'}
+    sites = {'enwiki'}

    with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
        line = file.readline()
@@ -40,8 +42,25 @@ def _read_wikidata():
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
+
                unique_id = obj["id"]
-                print(unique_id)
+                print("ID:", unique_id)
+
+                entry_type = obj["type"]
+                print("type:", entry_type)
+
+                # TODO: filter on rank:  preferred, normal or deprecated
+                claims = obj["claims"]
+                for prop in properties:
+                    claim_property = claims.get(prop, None)
+                    if claim_property:
+                        for cp in claim_property:
+                            print(prop, cp['mainsnak']['datavalue']['value']['id'])
+
+                entry_sites = obj["sitelinks"]
+                for site in sites:
+                    site_value = entry_sites.get(site, None)
+                    print(site, ":", site_value['title'])

                labels = obj["labels"]
                if labels: