mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	This currently doesn't work for Hungarian because of conflicts with the
custom punctuation rules. It also doesn't take multi-character emoji such as
👩🏽‍💻 into account.
		
	
			
		
			
				
	
	
		
			51 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | ||
| from __future__ import unicode_literals
 | ||
| 
 | ||
| import regex as re
 | ||
| 
 | ||
| 
 | ||
| re.DEFAULT_VERSION = re.VERSION1
 | ||
def merge_char_classes(classes):
    """Join character classes into a single class that matches any of them.

    Each item is expected to be a bracketed class such as '[a-z]'. The items
    are joined with '||' (the set-union operator of the regex module's
    VERSION1 syntax) and wrapped in one outer pair of brackets.
    """
    return '[{}]'.format('||'.join(classes))


def split_chars(char):
    """Split a space-separated string of patterns into a list of patterns.

    Surrounding whitespace is stripped first. Empty items, which repeated
    spaces (or an empty input) would otherwise produce, are dropped so that
    no empty pattern ends up in the result.
    """
    return [piece for piece in char.strip().split(' ') if piece]


def merge_chars(char):
    """Turn a space-separated string of patterns into a regex alternation.

    Built on split_chars() so that repeated spaces cannot introduce an empty
    alternative ('a||b'), which would match the empty string.
    """
    return '|'.join(split_chars(char))
 | ||
| 
 | ||
| 
 | ||
| _bengali = r'[\p{L}&&\p{Bengali}]'
 | ||
| _hebrew = r'[\p{L}&&\p{Hebrew}]'
 | ||
| _latin_lower = r'[\p{Ll}&&\p{Latin}]'
 | ||
| _latin_upper = r'[\p{Lu}&&\p{Latin}]'
 | ||
| _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
 | ||
| 
 | ||
| _upper = [_latin_upper]
 | ||
| _lower = [_latin_lower]
 | ||
| _uncased = [_bengali, _hebrew]
 | ||
| 
 | ||
# Combined letter classes. The uncased scripts are appended to every variant,
# so their letters are covered by the upper, the lower and the general class.
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA = merge_char_classes(_upper + _lower + _uncased)
 | ||
| 
 | ||
| 
 | ||
| _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
 | ||
|           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
 | ||
|           'TB T G M K')
 | ||
| _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 | ||
| _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 | ||
| _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 | ||
| _hyphens = '- – — -- ---'
 | ||
| _other_symbols = r'[\p{So}]'
 | ||
| 
 | ||
# Every symbol group is exposed in two forms: one alternation string built
# with merge_chars() and a list of the individual patterns built with
# split_chars().
UNITS = merge_chars(_units)
LIST_UNITS = split_chars(_units)

CURRENCY = merge_chars(_currency)
LIST_CURRENCY = split_chars(_currency)

QUOTES = merge_chars(_quotes)
LIST_QUOTES = split_chars(_quotes)

PUNCT = merge_chars(_punct)
LIST_PUNCT = split_chars(_punct)

HYPHENS = merge_chars(_hyphens)
LIST_HYPHENS = split_chars(_hyphens)

# Ellipses exist in list form only: two or more ASCII dots, or the
# single-character ellipsis.
LIST_ELLIPSES = [r'\.\.+', '…']

# Icons are already one character class, so it is used as-is and wrapped in
# a one-item list for the list form.
ICONS = _other_symbols
LIST_ICONS = [_other_symbols]
 |