mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	Fix tokenization of 'i.' for Danish.
This commit is contained in:
		
							parent
							
								
									726fb2d0b5
								
							
						
					
					
						commit
						ac8116510d
					
				|  | @@ -1,7 +1,7 @@ | |||
| # encoding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| -from ...symbols import ORTH, LEMMA, NORM | ||||
| +from ...symbols import ORTH, LEMMA, NORM, TAG, ADP, PUNCT | ||||
| 
 | ||||
| 
 | ||||
| _exc = {} | ||||
|  | @@ -28,5 +28,12 @@ for orth in [ | |||
|     "t.o.m.", "vha.", ""]: | ||||
|     _exc[orth] = [{ORTH: orth}] | ||||
| 
 | ||||
| _custom_base_exc = { | ||||
|     "i.": [ | ||||
|         {ORTH: "i", LEMMA: "i", NORM: "i"}, | ||||
|         {ORTH: ".", TAG: PUNCT}] | ||||
| } | ||||
| _exc.update(_custom_base_exc) | ||||
| 
 | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = _exc | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user