mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	* add punctuation to grc Add support for special editorial punctuation that is common in ancient Greek texts. Ancient Greek texts, as found in digital and print form, have been largely edited by scholars. Restorations and improvements are normally marked with special characters that need to be handled properly by the tokenizer. * add unit tests * simplify regex * move generic quotes to char classes * rename unit test * fix regex Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: svlandeg <svlandeg@github.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
		
			
				
	
	
		
			47 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			47 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 | |
| from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
 | |
| from ..char_classes import CONCAT_QUOTES
 | |
| 
 | |
# Prefix patterns for Ancient Greek. The two editorial marks come first so
# that they are listed ahead of the shared character classes:
#   "†"  dagger (U+2020)
#   "⸏"  paragraphos (U+2E0F)
# The remaining entries are the shared punctuation, ellipsis, quote, currency
# and icon lists imported from char_classes, in that order.
_prefixes = [
    "†",
    "⸏",
    *LIST_PUNCT,
    *LIST_ELLIPSES,
    *LIST_QUOTES,
    *LIST_CURRENCY,
    *LIST_ICONS,
]
 | |
| 
 | |
# Suffix patterns for Ancient Greek: the shared punctuation, ellipsis, quote
# and icon lists from char_classes, followed by the Greek-specific additions.
_suffixes = [
    *LIST_PUNCT,
    *LIST_ELLIPSES,
    *LIST_QUOTES,
    *LIST_ICONS,
    # Editorial marks: dagger (U+2020) and editorial coronis (U+2E0E).
    "†",
    "⸎",
    # A hyphen, period or paragraphos (U+2E0F), but only when the preceding
    # character is in U+1F00-U+1FFF (Greek Extended) or U+0370-U+03FF
    # (Greek and Coptic).
    r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
]
 | |
| 
 | |
# Infix patterns for Ancient Greek: the shared ellipsis and icon lists from
# char_classes, followed by context-sensitive patterns. Each pattern uses
# lookbehind/lookahead so that only the separator itself is matched.
_infixes = [
    *LIST_ELLIPSES,
    *LIST_ICONS,
    # One of + - * ^ between a digit and a following digit or hyphen.
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # A period preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
    # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER,
        au=ALPHA_UPPER,
        q=CONCAT_QUOTES,
    ),
    # A comma with an ALPHA character on both sides.
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Any HYPHENS alternative between an ALPHA-or-digit character and an
    # ALPHA character.
    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # One of : < > = / between an ALPHA-or-digit character and an ALPHA
    # character.
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    # An em dash directly after a character in U+1F00-U+1FFF (Greek Extended)
    # or U+0370-U+03FF (Greek and Coptic).
    r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
]
 | |
| 
 | |
# Public names for the pattern lists assembled above, bound in one step.
(
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
    TOKENIZER_INFIXES,
) = (_prefixes, _suffixes, _infixes)
 |