mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	run NER on clean WP text and link to gold-standard entity IDs
This commit is contained in:
		
							parent
							
								
									581dc9742d
								
							
						
					
					
						commit
						cba9680d13
					
				| 
						 | 
					@ -515,15 +515,12 @@ def add_coref():
 | 
				
			||||||
def create_training():
 | 
					def create_training():
 | 
				
			||||||
    nlp = spacy.load('en_core_web_sm')
 | 
					    nlp = spacy.load('en_core_web_sm')
 | 
				
			||||||
    wp_to_id = _get_entity_to_id()
 | 
					    wp_to_id = _get_entity_to_id()
 | 
				
			||||||
    _read_wikipedia(nlp, wp_to_id, limit=10000)
 | 
					    _read_wikipedia_texts(nlp, wp_to_id, limit=10000)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _read_wikipedia(nlp, wp_to_id, limit=None):
 | 
					def _read_wikipedia_texts(nlp, wp_to_id, limit=None):
 | 
				
			||||||
    """ Read the XML wikipedia data to parse out training data """
 | 
					    """ Read the XML wikipedia data to parse out training data """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
 | 
					 | 
				
			||||||
    # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
 | 
					    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
 | 
				
			||||||
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
 | 
					    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text):
 | 
				
			||||||
    for alias, entity, norm in zip(aliases, entities, normalizations):
 | 
					    for alias, entity, norm in zip(aliases, entities, normalizations):
 | 
				
			||||||
        entity_id = wp_to_id.get(entity)
 | 
					        entity_id = wp_to_id.get(entity)
 | 
				
			||||||
        if entity_id:
 | 
					        if entity_id:
 | 
				
			||||||
            # print(" ", alias, '-->', entity, '-->', entity_id)
 | 
					 | 
				
			||||||
            article_dict[alias] = entity_id
 | 
					            article_dict[alias] = entity_id
 | 
				
			||||||
            article_dict[entity] = entity_id
 | 
					            article_dict[entity] = entity_id
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # get the raw text without markup etc
 | 
					    # get the raw text without markup etc
 | 
				
			||||||
    clean_text = _get_clean_wp_text(text)
 | 
					    clean_text = _get_clean_wp_text(text)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    #print(text)
 | 
					 | 
				
			||||||
    print(clean_text)
 | 
					    print(clean_text)
 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    _run_ner(nlp, article_id, article_title, clean_text, article_dict)
 | 
					    _run_ner(nlp, article_id, article_title, clean_text, article_dict)
 | 
				
			||||||
 | 
					    print()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
info_regex = re.compile(r'{[^{]*?}')
 | 
					info_regex = re.compile(r'{[^{]*?}')
 | 
				
			||||||
| 
						 | 
					@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
 | 
					def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
 | 
				
			||||||
    pass # TODO
 | 
					    doc = nlp(clean_text)
 | 
				
			||||||
 | 
					    for ent in doc.ents:
 | 
				
			||||||
 | 
					        if ent.label_ == "PERSON":           # TODO: expand to non-persons
 | 
				
			||||||
 | 
					            ent_id = article_dict.get(ent.text)
 | 
				
			||||||
 | 
					            if ent_id:
 | 
				
			||||||
 | 
					                print(" -", ent.text, ent.label_, ent_id)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                print(" -", ent.text, ent.label_, '???')  # TODO: investigate these cases
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    print("START", datetime.datetime.now())
 | 
					    print("START", datetime.datetime.now())
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user