mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			73 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			73 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# -*- coding: utf-8 -*-
 | 
						||
 | 
						||
from __future__ import unicode_literals
 | 
						||
 | 
						||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 | 
						||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
 | 
						||
from ..char_classes import QUOTES, CURRENCY
 | 
						||
 | 
						||
# Space-separated unit abbreviations, written in both Latin and Cyrillic
# letters. The string is only a convenient way to list them; UNITS below is
# the regex-ready form that the suffix rules interpolate.
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
          'TB T G M K км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')


def merge_chars(char):
    """Join a whitespace-separated list of items into a regex alternation.

    char: string holding the items, separated by any amount of whitespace.
    Returns the items joined with '|', e.g. 'km m kg' -> 'km|m|kg'.

    Splitting on whitespace (instead of replacing single spaces) means that
    repeated spaces, tabs or newlines can never produce an empty alternative
    such as 'a||b', which would let the resulting group match the empty
    string. An empty or all-whitespace input yields ''.
    """
    return '|'.join(char.split())


# Alternation of every unit, ready to be dropped into a '(?:...)' group.
UNITS = merge_chars(_units)
 | 
						||
 | 
						||
# Prefix patterns, exported at the bottom of the module as TOKENIZER_PREFIXES.
# The hand-written entries come first; the shared punctuation, ellipsis,
# quote, currency and icon lists from char_classes are appended after them.
_prefixes = (
    [
        '\'\'',
        '§',
        '%',
        '=',
        r'\+[0-9]+%',  # +90%
        r'\'([0-9]){2}([\-]\'([0-9]){2})*',  # '12 , '12-'13
        r'\-([0-9]){1,9}\.([0-9]){1,9}',  # -12.13
        r'\'([Α-Ωα-ωίϊΐόάέύϋΰήώ]+)\'',  # 'αβγ'
        r'([Α-Ωα-ωίϊΐόάέύϋΰήώ]){1,3}\'',  # αβγ'
        # http://www.abc-def.gr/path.html -- the host part must contain a hyphen.
        # NOTE(review): the '.' after 'www' is unescaped, so it matches any
        # character; left as-is to keep tokenization unchanged.
        r'http://www.[A-Za-z]+\-[A-Za-z]+(\.[A-Za-z]+)+(\/[A-Za-z]+)*(\.[A-Za-z]+)*',
        r'[ΈΆΊΑ-Ωα-ωίϊΐόάέύϋΰήώ]+\*',  # όνομα*
        r'\$([0-9])+([\,\.]([0-9])+){0,1}',  # $12 , $12.50 , $12,50
    ]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)
 | 
						||
 | 
						||
# Suffix patterns, exported at the bottom of the module as TOKENIZER_SUFFIXES.
# The shared punctuation, ellipsis, quote and icon lists from char_classes come
# first, followed by the hand-written entries.
_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r'(?<=[0-9])\+',  # 12+
        r'([0-9])+\'',  # 12'
        r'([A-Za-z])?\'',  # a'
        r'^([0-9]){1,2}\.',  # 12. at the start of the string
        r' ([0-9]){1,2}\.',  # 12. after a space
        r'([0-9]){1}\) ',  # 1) followed by a space
        r'^([0-9]){1}\)$',  # 1) as the whole string
        r'(?<=°[FfCcKk])\.',  # °C.
        r'([0-9])+\&',  # 12&
        # A currency symbol or a unit directly after a digit.
        r'(?<=[0-9])(?:{})'.format(CURRENCY),
        r'(?<=[0-9])(?:{})'.format(UNITS),
        # A full stop after a digit, a lowercase letter, one of ² - ) ] + or
        # the characters supplied by QUOTES.
        r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'²\-\)\]\+', QUOTES),
        # A full stop after two uppercase letters.
        r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER),
        r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-',  # όνομα-
        r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.',  # όνομα.
        r'^[Α-Ω]{1}\.',  # Α. at the start of the string
        r'\ [Α-Ω]{1}\.',  # Α. after a space
        # πρώτος-δεύτερος , πρώτος-δεύτερος-τρίτος
        r'[ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+([\-]([ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+))+',
        r'([0-9]+)mg',  # 13mg
        r'([0-9]+)\.([0-9]+)m',  # 1.2m
    ]
)
 | 
						||
 | 
						||
# Infix patterns, exported at the bottom of the module as TOKENIZER_INFIXES.
# The shared ellipsis and icon lists from char_classes come first, followed by
# the hand-written entries.
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r'(?<=[0-9])[+\/\-\*^](?=[0-9])',  # 1+2 , 1/2 , 1-2 , 1*2 , 1^2
        r'([a-zA-Z]+)\/([a-zA-Z]+)\/([a-zA-Z]+)',  # name1/name2/name3
        # At least one '-digits' part is required: 10-6 , 10.9-6 , 10.9.9-6
        r'([0-9])+(\.([0-9]+))*([\-]([0-9])+)+',
        r'([0-9])+[,]([0-9])+[\-]([0-9])+[,]([0-9])+',  # 10,11-12,13
        r'([0-9])+[ης]+([\-]([0-9])+)+',  # 1ης-2
        # 15/2 , 15/2/17 , 2017/2/15
        r'([0-9]){1,4}[\/]([0-9]){1,2}([\/]([0-9]){0,4}){0,1}',
        r'[A-Za-z]+\@[A-Za-z]+(\-[A-Za-z]+)*\.[A-Za-z]+',  # abc@cde-fgh.a
        r'([a-zA-Z]+)(\-([a-zA-Z]+))+',  # abc-abc
        # A full stop between a lowercase and an uppercase letter.
        r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
        # A comma between two letters.
        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
        # A hyphen (optionally preceded by ? " ; : = , .) between two letters.
        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
        # One of : < > = / between a letter (or a double quote) and a letter.
        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
    ]
)
 | 
						||
 | 
						||
# Public names for the three rule lists built above.
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
 |