mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
reading all persons in wikidata
This commit is contained in:
parent
60b54ae8ce
commit
3629a52ede
|
@@ -50,7 +50,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
|
|||
print("1. _read_wikidata_entities", datetime.datetime.now())
|
||||
print()
|
||||
# title_to_id = _read_wikidata_entities_regex(limit=1000)
|
||||
-title_to_id = _read_wikidata_entities_json(limit=1000)
|
||||
+title_to_id = _read_wikidata_entities_json(limit=None)
|
||||
|
||||
title_list = list(title_to_id.keys())
|
||||
entity_list = [title_to_id[x] for x in title_list]
|
||||
|
@@ -209,7 +209,7 @@ def _read_wikidata_entities_json(limit=None, to_print=False):
|
|||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
-if cnt % 100000 == 0:
|
||||
+if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
|
@@ -307,7 +307,7 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
|
|||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
-if cnt % 100000 == 0:
|
||||
+if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
|
|
Loading…
Reference in New Issue
Block a user