mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
reading types, claims and sitelinks
This commit is contained in:
parent
6e997be4b4
commit
b31a390a9a
|
@ -30,6 +30,8 @@ def _read_wikidata():
|
||||||
# TODO remove hardcoded path
|
# TODO remove hardcoded path
|
||||||
|
|
||||||
languages = {'en', 'de'}
|
languages = {'en', 'de'}
|
||||||
|
properties = {'P31'}
|
||||||
|
sites = {'enwiki'}
|
||||||
|
|
||||||
with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
|
with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
|
||||||
line = file.readline()
|
line = file.readline()
|
||||||
|
@ -40,8 +42,25 @@ def _read_wikidata():
|
||||||
clean_line = clean_line[:-1]
|
clean_line = clean_line[:-1]
|
||||||
if len(clean_line) > 1:
|
if len(clean_line) > 1:
|
||||||
obj = json.loads(clean_line)
|
obj = json.loads(clean_line)
|
||||||
|
|
||||||
unique_id = obj["id"]
|
unique_id = obj["id"]
|
||||||
print(unique_id)
|
print("ID:", unique_id)
|
||||||
|
|
||||||
|
entry_type = obj["type"]
|
||||||
|
print("type:", entry_type)
|
||||||
|
|
||||||
|
# TODO: filter on rank: preferred, normal or deprecated
|
||||||
|
claims = obj["claims"]
|
||||||
|
for prop in properties:
|
||||||
|
claim_property = claims.get(prop, None)
|
||||||
|
if claim_property:
|
||||||
|
for cp in claim_property:
|
||||||
|
print(prop, cp['mainsnak']['datavalue']['value']['id'])
|
||||||
|
|
||||||
|
entry_sites = obj["sitelinks"]
|
||||||
|
for site in sites:
|
||||||
|
site_value = entry_sites.get(site, None)
|
||||||
|
print(site, ":", site_value['title'])
|
||||||
|
|
||||||
labels = obj["labels"]
|
labels = obj["labels"]
|
||||||
if labels:
|
if labels:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user