spaCy/spacy/language_data/punctuation.py
Yam 978b24ccd4 Update punctuation.py
In Chinese, `~` and `——` is hyphens,   
`·` is intermittent symbol
2017-09-20 23:02:22 +08:00

112 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
import regex as re
re.DEFAULT_VERSION = re.VERSION1
_UNITS = """
km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
TB T G M K
"""
_CURRENCY = r"""
\$ £ € ¥ ฿ US\$ C\$ A\$
"""
_QUOTES = r"""
' '' " ” “ `` ` ´ , „ » «
「 」 『 』 【 】 《 》 〈 〉
"""
_PUNCT = r"""
… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
·
"""
_HYPHENS = r"""
- — -- --- —— ~
"""
LIST_ELLIPSES = [
r'\.\.+',
""
]
LIST_CURRENCY = list(_CURRENCY.strip().split())
LIST_QUOTES = list(_QUOTES.strip().split())
LIST_PUNCT = list(_PUNCT.strip().split())
LIST_HYPHENS = list(_HYPHENS.strip().split())
BENGALI = r'[\p{L}&&\p{Bengali}]'
HEBREW = r'[\p{L}&&\p{Hebrew}]'
LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'
LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]'
LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER]))
ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER]))
ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN]))
QUOTES = _QUOTES.strip().replace(' ', '|')
CURRENCY = _CURRENCY.strip().replace(' ', '|')
UNITS = _UNITS.strip().replace(' ', '|').replace('\n', '|')
HYPHENS = _HYPHENS.strip().replace(' ', '|')
# Prefixes
TOKENIZER_PREFIXES = (
['§', '%', '=', r'\+'] +
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
LIST_CURRENCY
)
# Suffixes
TOKENIZER_SUFFIXES = (
LIST_PUNCT +
LIST_ELLIPSES +
LIST_QUOTES +
[
r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
"'s", "'S", "s", "S"
]
)
# Infixes
TOKENIZER_INFIXES = (
LIST_ELLIPSES +
[
r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)
]
)
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]