spaCy/spacy/lang/id/tokenizer_exceptions.py

71 lines
2.8 KiB
Python
Raw Normal View History

2017-07-23 18:55:05 +03:00
# coding: utf8
from __future__ import unicode_literals
2017-07-26 15:13:47 +03:00
import regex as re
2017-07-24 10:11:51 +03:00
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
2017-07-26 15:13:47 +03:00
from ..tokenizer_exceptions import URL_PATTERN
2017-07-24 10:12:34 +03:00
from ...symbols import ORTH
2017-07-24 10:11:10 +03:00
2017-07-26 15:13:47 +03:00
2017-07-24 10:11:10 +03:00
# Map each base exception, in several casings, to a single-token analysis
# (a one-element list holding only the ORTH attribute).
_exc = {}

for orth in ID_BASE_EXCEPTIONS:
    # Order matters only for dict insertion order: as-is, title, upper, lower.
    variants = [orth, orth.title(), orth.upper(), orth.lower()]
    if '-' in orth:
        # Hyphenated entries also get each hyphen-separated part cased
        # individually.
        parts = orth.split('-')
        variants.append('-'.join(part.title() for part in parts))
        variants.append('-'.join(part.upper() for part in parts))
    for variant in variants:
        _exc[variant] = [{ORTH: variant}]
2017-08-20 08:16:50 +03:00
# Register abbreviations as single-token exceptions, keyed by their exact
# spelling (no casing variants are generated for these).
for orth in [
    # English-style abbreviations.
    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
    # Titles and degree-style abbreviations.
    # "M.Kes." previously ended in a comma ("M.Kes,"), unlike its siblings.
    "B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
    "M.Ag.", "M.Hum.", "M.Kes.", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
    "M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
    "S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
    "S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
    # Lower-case general abbreviations.
    # A comma was missing between "p.p." and "pjs."; implicit string
    # concatenation turned them into the single bogus entry "p.p.pjs.".
    "a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
    "dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
    "n.b.", "p.p.", "pjs.", "s.d.", "tel.", "u.p.",
]:
    _exc[orth] = [{ORTH: orth}]
2017-07-26 15:28:57 +03:00
# Word-initial elements: "<prefix>-<alphanumerics>" is matched as one token.
_hyphen_prefix = """abdur abdus abou aboul abror abshar abu abubakar abul
aero agri agro ahmadi ahmed air abd abdel abdul ad adz afro al ala ali all
amir an antar anti ar as ash asy at ath az bekas ber best bi co di double
dual duo e eco eks el era ex full hi high i in inter intra ke kontra korona
kuartal lintas m macro makro me mem meng micro mid mikro mini multi neo nge
no non on pan pasca pe pem poli poly post pra pre pro re se self serba seri
sub super t trans u ultra un x""".split()

# Circumfix-like pairs written "<head>-<tail>": matched as
# "<head>-<alphanumerics>-<tail>".
_hyphen_infix = """ber-an berke-an de-isasi di-kan di-kannya di-nya ke-an
ke-annya me-kan me-kannya men-kan men-kannya meng-kannya pe-an pen-an
per-an per-i se-an se-nya ter-i ter-kan ter-kannya""".split()

# Word-final elements: "<alphanumerics>-<suffix>" is matched as one token.
# This must be a list: iterating the bare string "el" yielded the
# single-character suffixes "e" and "l" instead of "el".
_hyphen_suffix = """el""".split()

# One anchored pattern per prefix / infix pair / suffix. They are combined
# into a single alternation further down in the module.
_regular_exp = ['^{p}-[A-Za-z0-9]+$'.format(p=prefix) for prefix in _hyphen_prefix]
_regular_exp += ['^{0}-[A-Za-z0-9]+-{1}$'.format(*infix.split('-')) for infix in _hyphen_infix]
_regular_exp += ['^[A-Za-z0-9]+-{s}$'.format(s=suffix) for suffix in _hyphen_suffix]
2017-07-26 15:13:47 +03:00
# URLs are matched alongside the hyphenation patterns.
_regular_exp.append(URL_PATTERN)

# Public exception table: a shallow copy of the accumulated entries.
TOKENIZER_EXCEPTIONS = dict(_exc)

# Wrap every pattern in a non-capturing group, join them into one
# case-insensitive alternation, and expose the compiled pattern's bound
# ``match`` method.
TOKEN_MATCH = re.compile(
    '|'.join('(?:{})'.format(pattern) for pattern in _regular_exp),
    re.IGNORECASE
).match