2017-03-12 15:07:28 +03:00
|
|
|
|
# coding: utf8
|
2016-12-08 21:46:43 +03:00
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2017-04-20 02:22:52 +03:00
|
|
|
|
import regex as re
|
|
|
|
|
# Make VERSION1 the default behaviour of the `regex` package. The character
# classes defined in this module use nested sets with the `&&` and `||` set
# operators, which `regex` documents as VERSION1 features.
re.DEFAULT_VERSION = re.VERSION1
|
2017-01-08 22:37:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Measurement-unit abbreviations, whitespace-separated (spaces or newlines).
# The table is later turned into a regex alternation, so every token here
# must be a bare unit with no stray separator characters in between.
_UNITS = """
km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
TB T G M K
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Currency markers, whitespace-separated. Each token is already a regex
# fragment, which is why the dollar sign is written as "\$".
_CURRENCY = r"""
\$ £ € ¥ ฿ US\$ C\$ A\$
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_QUOTES = r"""
|
|
|
|
|
' '' " ” “ `` ` ‘ ´ ‚ , „ » «
|
2017-09-19 11:57:24 +03:00
|
|
|
|
「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉
|
2017-01-08 22:37:39 +03:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_PUNCT = r"""
|
|
|
|
|
… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
|
2017-09-20 18:02:22 +03:00
|
|
|
|
。 ? ! , 、 ; : ~ ·
|
2017-01-08 22:37:39 +03:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hyphen- and dash-like separators, whitespace-separated. Multi-character
# entries ("--", "---", "——") are listed alongside the single characters.
_HYPHENS = r"""
- – — -- --- —— ~
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Regex fragments that match an ellipsis: two or more ASCII periods, the
# single ellipsis character, and a doubled ellipsis character. Each
# alternative is its own list entry; a single "… ……" string would be one
# fragment containing a literal space.
LIST_ELLIPSES = [
    r'\.\.+',
    "…",
    "……",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Break each whitespace-separated table into a list of regex fragments.
# str.split() with no argument splits on any run of whitespace, ignores
# leading/trailing whitespace and returns a fresh list, so no extra
# strip() or list() call is needed.
LIST_CURRENCY = _CURRENCY.split()
LIST_QUOTES = _QUOTES.split()
LIST_PUNCT = _PUNCT.split()
LIST_HYPHENS = _HYPHENS.split()
|
|
|
|
|
|
|
|
|
|
|
2017-04-20 02:25:02 +03:00
|
|
|
|
# Per-script letter classes, written with the `regex` package's set
# operators: "&&" intersects two sets and "||" unions them.
BENGALI = r'[\p{L}&&\p{Bengali}]'
HEBREW = r'[\p{L}&&\p{Hebrew}]'
LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'
LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]'
LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'

# Combined classes. BENGALI and HEBREW are included unchanged in both the
# lower- and upper-case unions; only the Latin member differs between them.
ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER]))
ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER]))
ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN]))
|
2017-01-08 22:37:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Regex alternations ("a|b|c") built from the whitespace-separated tables.
# The tables are split on any whitespace rather than having only spaces
# replaced: _QUOTES spans two lines, and replacing spaces alone would leave
# the newline in place and fuse the last entry of the first line with the
# first entry of the second into a single alternative.
QUOTES = '|'.join(_QUOTES.split())
CURRENCY = '|'.join(_CURRENCY.split())
UNITS = '|'.join(_UNITS.split())
HYPHENS = '|'.join(_HYPHENS.split())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Prefixes
|
|
|
|
|
|
|
|
|
|
# Regex fragments for characters split off the front of a token: a few
# symbols listed inline, then the punctuation, ellipsis, quote and currency
# tables, concatenated in that order.
TOKENIZER_PREFIXES = (
    ['§', '%', '=', r'\+'] +
    LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES +
    LIST_CURRENCY
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Suffixes
|
|
|
|
|
|
|
|
|
|
# Regex fragments for characters split off the end of a token: the
# punctuation, ellipsis and quote tables, followed by context-sensitive
# patterns that use a lookbehind to decide when a trailing symbol is split.
TOKENIZER_SUFFIXES = (
    LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES +
    [
        # "+" directly after a digit.
        r'(?<=[0-9])\+',
        # "." after a degree sign followed by one of F/C/K (either case).
        r'(?<=°[FfCcKk])\.',
        # Currency marker directly after a digit.
        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
        # Unit abbreviation directly after a digit.
        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
        # "." after a digit, a lower-case letter, one of % ² - ) ] + or a
        # quote character.
        # NOTE(review): "(?:{q})" sits inside the character set here, so it
        # is presumably read as literal set members rather than as a
        # group -- confirm this is intended.
        r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
        # "." after two consecutive upper-case letters.
        r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
        # Possessive endings with a straight or a curly apostrophe.
        "'s", "'S", "’s", "’S"
    ]
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Infixes
|
|
|
|
|
|
|
|
|
|
# Regex fragments for separators found inside a token: the ellipsis
# patterns, followed by patterns whose lookbehind/lookahead restrict the
# split to specific neighbouring characters.
TOKENIZER_INFIXES = (
    LIST_ELLIPSES +
    [
        # Arithmetic operator between a digit and a digit or "-".
        r'(?<=[0-9])[+\-\*^](?=[0-9-])',
        # "." between a lower-case letter and an upper-case letter.
        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # "," between two letters.
        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
        # Hyphen/dash between two letters, optionally preceded by a run of
        # the punctuation characters ? " ; : = , .
        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
        # One of : < > = / between a letter (or a double quote) and a letter.
        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)
    ]
)
|
2016-12-08 21:46:43 +03:00
|
|
|
|
|
|
|
|
|
|
2016-12-18 17:35:56 +03:00
|
|
|
|
# Public names of this module: the three tokenizer rule lists.
__all__ = [
    "TOKENIZER_PREFIXES",
    "TOKENIZER_SUFFIXES",
    "TOKENIZER_INFIXES",
]
|