spaCy/spacy/lang/el/punctuation.py

67 lines
3.5 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import QUOTES, CURRENCY
# Space-separated measurement-unit abbreviations (Latin and Cyrillic script).
# Each source line ends with a single space so that the implicit string
# concatenation keeps neighbouring units apart.
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
          'TB T G M K км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')


def merge_chars(char):
    """Join whitespace-separated tokens into a ``|``-separated alternation.

    char: a string of tokens separated by whitespace.
    Returns the tokens joined with ``|``. Leading/trailing whitespace is
    ignored, and runs of whitespace count as a single separator so that no
    empty alternative (``a||b``) can end up in the result.
    """
    return '|'.join(char.split())


# Alternation of all unit abbreviations, ready to be embedded in a regex group.
UNITS = merge_chars(_units)
# Prefix patterns: Greek-specific rules first, followed by the punctuation,
# ellipsis, quote, currency and icon lists imported from char_classes.
# The relative order of the entries is unchanged.
_prefixes = (
    [
        '\'\'',
        '§',
        '%',
        '=',
        r'\+[0-9]+%',  # +90%
        r'\'([0-9]){2}([\-]\'([0-9]){2})*',  # '12 , '12-'13
        r'\-([0-9]){1,9}\.([0-9]){1,9}',  # -12.13
        r'\'([Α-Ωα-ωίϊΐόάέύϋΰήώ]+)\'',  # 'αβγ'
        r'([Α-Ωα-ωίϊΐόάέύϋΰήώ]){1,3}\'',  # αβγ'
        # The dot after "www" is escaped so it matches only a literal "."
        # instead of any character.
        r'http://www\.[A-Za-z]+\-[A-Za-z]+(\.[A-Za-z]+)+(\/[A-Za-z]+)*(\.[A-Za-z]+)*',
        r'[ΈΆΊΑ-Ωα-ωίϊΐόάέύϋΰήώ]+\*',  # όνομα*
        r'\$([0-9])+([\,\.]([0-9])+){0,1}',  # $12 , $12.50 , $12,50
    ]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)
# Suffix patterns. The lists imported from char_classes come first, then the
# Greek-specific rules; the resulting order is significant and must be kept.
_suffixes = list(LIST_PUNCT)
_suffixes += LIST_ELLIPSES
_suffixes += LIST_QUOTES
_suffixes += LIST_ICONS
_suffixes += [
    # --- digits followed by a sign, quote, dot or bracket ---
    r'(?<=[0-9])\+',  # "+" right after a digit: 12+
    r'([0-9])+\'',  # 12'
    r'([A-Za-z])?\'',  # a' (the letter is optional)
    r'^([0-9]){1,2}\.',  # 12. at the very start
    r' ([0-9]){1,2}\.',  # 12. preceded by a space
    r'([0-9]){1}\) ',  # 1) followed by a space
    r'^([0-9]){1}\)$',  # the whole string is 1)
    r'(?<=°[FfCcKk])\.',  # "." after a degree unit such as °C
    r'([0-9])+\&',  # 12&
    # --- currency symbols and units directly after a digit ---
    r'(?<=[0-9])(?:{})'.format(CURRENCY),
    r'(?<=[0-9])(?:{})'.format(UNITS),
    # --- trailing "." / "-" after particular characters ---
    r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'²\-\)\]\+', QUOTES),
    r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER),  # "." after two ALPHA_UPPER characters
    r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\-',  # όνομα-
    r'(?<=[Α-Ωα-ωίϊΐόάέύϋΰήώ])\.',  # "." after a Greek letter
    r'^[Α-Ω]{1}\.',  # single capital Greek letter plus "." at the start
    r'\ [Α-Ω]{1}\.',  # the same, preceded by a space
    # --- hyphenated Greek words and a few number+unit forms ---
    r'[ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+([\-]([ΈΆΊΑΌ-Ωα-ωίϊΐόάέύϋΰήώ]+))+',  # πρώτος-δεύτερος , πρώτος-δεύτερος-τρίτος
    r'([0-9]+)mg',  # 13mg
    r'([0-9]+)\.([0-9]+)m',  # 1.2m
]
# Infix patterns. The ellipsis and icon lists imported from char_classes come
# first, then the Greek-specific rules; the resulting order must be kept.
_infixes = list(LIST_ELLIPSES)
_infixes += LIST_ICONS
_infixes += [
    # --- numeric expressions ---
    r'(?<=[0-9])[+\/\-\*^](?=[0-9])',  # operator between digits: 1/2 , 1-2 , 1*2
    r'([a-zA-Z]+)\/([a-zA-Z]+)\/([a-zA-Z]+)',  # name/name/name (Latin letters only)
    r'([0-9])+(\.([0-9]+))*([\-]([0-9])+)+',  # 10-6 , 10.9-6 , 10.9.9-6
    r'([0-9])+[,]([0-9])+[\-]([0-9])+[,]([0-9])+',  # 10,11-12,13
    r'([0-9])+[ης]+([\-]([0-9])+)+',  # 1ης-2
    r'([0-9]){1,4}[\/]([0-9]){1,2}([\/]([0-9]){0,4}){0,1}',  # 15/2 , 15/2/17 , 2017/2/15
    # --- Latin-script e-mail addresses and hyphenated words ---
    r'[A-Za-z]+\@[A-Za-z]+(\-[A-Za-z]+)*\.[A-Za-z]+',  # abc@cde-fgh.a
    r'([a-zA-Z]+)(\-([a-zA-Z]+))+',  # abc-abc
    # --- punctuation between characters of the char_classes classes ---
    # "." between an ALPHA_LOWER and an ALPHA_UPPER character
    r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
    # "," between two ALPHA characters
    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
    # a HYPHENS alternative, optionally preceded by punctuation, between ALPHA characters
    r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
    # one of : < > = / after an ALPHA character or a double quote, before an ALPHA character
    r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA),
]
# Public names under which the three pattern lists are exported.
TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES = (
    _prefixes,
    _suffixes,
    _infixes,
)