# encoding: utf8
from __future__ import unicode_literals

# Each constant below is built from a raw triple-quoted block that holds one
# regular-expression fragment per line; ``.strip().split('\n')`` turns the
# block into a list of pattern strings.  Every line inside a block becomes a
# list entry, so nothing except patterns may appear there (no comments, no
# blank lines, no separator rows).

# Judging by the name, fragments matched at the start of a token.
# NOTE(review): ``+`` is stored unescaped and is not a valid regex on its
# own; presumably the consumer escapes it before compiling -- confirm.
TOKENIZER_PREFIXES = r'''
+
'''.strip().split('\n')

# Judging by the name, fragments matched at the end of a token.  The entries
# are, in order:
#   * single punctuation, quote and currency characters, and runs of 2-4 dots;
#   * a period preceded (lookbehind) by a lowercase letter, closing
#     bracket/quote or a few other symbols, and ``-e`` after a letter or ``)``;
#   * ``--``, the acute accent, and ``+`` directly after a digit;
#   * unit abbreviations (area, volume, length, mass, speed, pressure and
#     single-letter T/G/M/K plus ``kb``) directly after a digit;
#   * a period directly after a degree sign followed by F, C or K.
# Lowercase character classes list the accented letters üóőúéáűí explicitly.
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
\$
>
:
;
'
”
“
«
_
''
’
‘
€
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
(?<=[a-züóőúéáűí)])-e
\-\-
´
(?<=[0-9])\+
(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\.
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=°[FCK])\.
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')

# Judging by the name, fragments matched inside a token.  In order: the
# ellipsis character; two or more dots; a period between a lowercase and an
# uppercase letter; a double quote between an alphanumeric and a letter or
# hyphen; ``--`` between letters; an arithmetic operator between digits; and
# a comma between letters.
TOKENIZER_INFIXES = r'''
…
\.\.+
(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
(?<=[0-9])[+\-\*/^](?=[0-9])
(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
'''.strip().split('\n')

# Names exported by ``from <module> import *``.
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]