mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			28 lines
		
	
	
		
			940 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			28 lines
		
	
	
		
			940 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| '''Demonstrate adding a rule-based component that forces some tokens to not
 | |
| be entities, before the NER tagger is applied. This is used to hotfix the issue
 | |
| in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16.
 | |
| '''
 | |
| import spacy
 | |
| from spacy.attrs import ENT_IOB
 | |
| 
 | |
| def fix_space_tags(doc):
 | |
|     ent_iobs = doc.to_array([ENT_IOB])
 | |
|     for i, token in enumerate(doc):
 | |
|         if token.is_space:
 | |
|             # Sets 'O' tag (0 is None, so I is 1, O is 2)
 | |
|             ent_iobs[i] = 2
 | |
|     doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1)))
 | |
|     return doc
 | |
| 
 | |
| def main():
 | |
|     nlp = spacy.load('en_core_web_sm')
 | |
|     text = u'''This is some crazy test where I dont need an Apple                Watch to make things bug'''
 | |
|     doc = nlp(text)
 | |
|     print('Before', doc.ents)
 | |
|     nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner')
 | |
|     doc = nlp(text)
 | |
|     print('After', doc.ents)
 | |
| 
 | |
| if __name__ == '__main__':
 | |
|     main()
 |