From 9a8197185b733e471fa672e544fa2c8de57b991c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 22:37:50 +0200 Subject: [PATCH] fix alias capitalization --- examples/pipeline/wikidata_entity_linking.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b7dba1e0d..691be7990 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -204,7 +204,9 @@ def _read_wikidata(limit=None, to_print=False): def _read_wikipedia_prior_probs(): - """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ + """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines) + """ # find the links link_regex = re.compile(r'\[\[[^\[\]]*\]\]') @@ -266,9 +268,10 @@ def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): # remove everything after # as this is not part of the title but refers to a specific paragraph if normalize_entity: + # wikipedia titles are always capitalized entity = capitalize_first(entity.split("#")[0]) if normalize_alias: - alias = capitalize_first(alias.split("#")[0]) + alias = alias.split("#")[0] if alias and entity: alias_dict = map_alias_to_link.get(alias, dict())