# encoding: utf8
from __future__ import unicode_literals
import re


# No stop words are defined in this module; the set is intentionally left empty
# here and nothing visible in this file adds to it.
STOP_WORDS = set()


# Regex-escaped prefix strings, one per line of the literal below (order is
# kept exactly as written). The result is materialized as a list: on Python 3
# a bare ``map`` is a one-shot iterator, so the constant would be empty after
# its first traversal and would support neither ``len()`` nor indexing.
TOKENIZER_PREFIXES = list(map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
„
“
'
``
`
#
US$
C$
A$
a-
‘
....
...
‚
»
_
§
'''.strip().split('\n')))


# Suffix patterns (already written as regex fragments, so nothing is escaped
# here). The list has two parts, concatenated in this order:
#   1. punctuation, quote and possessive endings, one pattern per line;
#   2. unit abbreviations, each only recognised directly after a digit.
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
”
“
«
_
''
's
'S
’s
’S
’
‘
°
€
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
'''.strip().split('\n') + list(
    # A generator inside list() keeps the loop variable out of the module
    # namespace on every Python version.
    r'(?<=[0-9])' + unit
    for unit in (
        # area
        'km²', 'm²', 'cm²', 'mm²',
        # volume
        'km³', 'm³', 'cm³', 'mm³',
        # length / area
        'ha', 'km', 'm', 'cm', 'mm', 'µm', 'nm', 'yd', 'in', 'ft',
        # mass
        'kg', 'g', 'mg', 'µg', 't', 'lb', 'oz',
        # speed
        'm/s', 'km/h', 'mph',
        # temperature
        '°C', '°K', '°F',
        # pressure
        'hPa', 'Pa', 'mbar', 'mb',
        # magnitude letters
        'T', 'G', 'M', 'K', 'kb',
    )
)


# Regex fragments for split points inside a token. The letter lookaheads use
# ``[a-zA-Z]``; the earlier ``[a-zA-z]`` was a typo whose ``A-z`` range also
# admitted ``[ \ ] ^ _`` and the backtick, so a hyphen followed by one of
# those characters was wrongly treated as sitting between two letters.
TOKENIZER_INFIXES = [
    # three or more consecutive dots
    r'\.\.\.+',
    # a period between a lowercase and an uppercase letter
    r'(?<=[a-z])\.(?=[A-Z])',
    # a single hyphen between two letters
    r'(?<=[a-zA-Z])-(?=[a-zA-Z])',
    # a double hyphen between two letters
    r'(?<=[a-zA-Z])--(?=[a-zA-Z])',
    # a hyphen between two digits
    r'(?<=[0-9])-(?=[0-9])',
    # a comma between two letters
    r'(?<=[A-Za-z]),(?=[A-Za-z])',
]



# Tokenizer special cases. Each key is an exact input string; each value lists
# one attribute dict per resulting token. For every entry the "F" values,
# joined together, give back the key.
TOKENIZER_EXCEPTIONS = {
    "vs.": [{"F": "vs."}],
    "''": [{"F": "''"}],
    "—": [{"F": "—", "L": "--", "pos": "$,"}],
    "a.m.": [{"F": "a.m."}],
    "p.m.": [{"F": "p.m."}],
}

# Clock times written without a space ("1a.m.", "12pm", ...): split into the
# hour and the meridiem marker. The undotted markers also carry an "L" value
# holding the dotted spelling. Every entry gets its own fresh dicts.
TOKENIZER_EXCEPTIONS.update(
    ("%d%s" % (hour, marker["F"]), [{"F": "%d" % hour}, dict(marker)])
    for marker in (
        {"F": "a.m."},
        {"F": "am", "L": "a.m."},
        {"F": "p.m."},
        {"F": "pm", "L": "p.m."},
    )
    for hour in range(1, 13)
)

# Strings that stay a single token exactly as written.
TOKENIZER_EXCEPTIONS.update(
    (text, [{"F": text}])
    for text in (
        # dotted abbreviations
        "Ala.", "Ariz.", "Ark.", "Calif.", "Colo.", "Conn.", "Del.", "D.C.",
        "Fla.", "Ga.", "Ill.", "Ind.", "Kans.", "Kan.", "Ky.", "La.", "Md.",
        "Mass.", "Mich.", "Minn.", "Miss.", "Mo.", "Mont.", "Nebr.", "Neb.",
        "Nev.", "N.H.", "N.J.", "N.M.", "N.Y.", "N.C.", "N.D.", "Okla.",
        "Ore.", "Pa.", "Tenn.", "Va.", "Wash.", "Wis.",
        # emoticons
        ":)", "<3", ";)", "(:", ":(", "-_-", "=)", ":/", ":>", ";-)", ":Y",
        ":P", ":-P", ":3", "=3", "xD", "^_^", "=]", "=D", "<333", ":))",
        ":0", "-__-", "xDD", "o_o", "o_O", "V_V", "=[[", "<33", ";p", ";D",
        ";-p", ";(", ":p", ":]", ":O", ":-/", ":-)", ":(((", ":((", ":')",
        "(^_^)", "(=", "o.O", "\")",
    )
)

# A single lowercase letter followed by a period ("a." through "z.").
TOKENIZER_EXCEPTIONS.update(
    (letter + ".", [{"F": letter + "."}])
    for letter in "abcdefghijklmnopqrstuvwxyz"
)


# Maps each fine-grained tag to a dict holding a coarse "pos" value plus any
# additional feature/value pairs attached to that tag.
# NOTE(review): the tag names look like the German STTS tagset -- confirm.
TAG_MAP = {
    # punctuation
    "$(": {"pos": "PUNCT", "PunctType": "Brck"},
    "$,": {"pos": "PUNCT", "PunctType": "Comm"},
    "$.": {"pos": "PUNCT", "PunctType": "Peri"},

    # adjectives and adverbs
    "ADJA": {"pos": "ADJ"},
    "ADJD": {"pos": "ADJ", "Variant": "Short"},
    "ADV": {"pos": "ADV"},

    # adpositions
    "APPO": {"pos": "ADP", "AdpType": "Post"},
    "APPR": {"pos": "ADP", "AdpType": "Prep"},
    "APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
    "APZR": {"pos": "ADP", "AdpType": "Circ"},

    # articles, numbers, foreign material, interjections
    "ART": {"pos": "DET", "PronType": "Art"},
    "CARD": {"pos": "NUM", "NumType": "Card"},
    "FM": {"pos": "X", "Foreign": "Yes"},
    "ITJ": {"pos": "INTJ"},

    # conjunctions
    "KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
    "KON": {"pos": "CONJ"},
    "KOUI": {"pos": "SCONJ"},
    "KOUS": {"pos": "SCONJ"},

    # nouns
    "NE": {"pos": "PROPN"},
    "NNE": {"pos": "PROPN"},
    "NN": {"pos": "NOUN"},

    # pronominal tags
    "PAV": {"pos": "ADV", "PronType": "Dem"},
    "PROAV": {"pos": "ADV", "PronType": "Dem"},
    "PDAT": {"pos": "DET", "PronType": "Dem"},
    "PDS": {"pos": "PRON", "PronType": "Dem"},
    "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
    "PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
    "PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
    "PPER": {"pos": "PRON", "PronType": "Prs"},
    "PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
    "PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
    "PRELAT": {"pos": "DET", "PronType": "Rel"},
    "PRELS": {"pos": "PRON", "PronType": "Rel"},
    "PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},

    # particles
    "PTKA": {"pos": "PART"},
    "PTKANT": {"pos": "PART", "PartType": "Res"},
    "PTKNEG": {"pos": "PART", "Negative": "Neg"},
    "PTKVZ": {"pos": "PART", "PartType": "Vbp"},
    "PTKZU": {"pos": "PART", "PartType": "Inf"},

    # interrogatives
    "PWAT": {"pos": "DET", "PronType": "Int"},
    "PWAV": {"pos": "ADV", "PronType": "Int"},
    "PWS": {"pos": "PRON", "PronType": "Int"},

    # truncated word parts
    "TRUNC": {"pos": "X", "Hyph": "Yes"},

    # VA* tags (mapped to AUX)
    "VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
    "VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
    "VAINF": {"pos": "AUX", "VerbForm": "Inf"},
    "VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},

    # VM* tags (VERB with VerbType=Mod)
    "VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
    "VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
    "VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},

    # VV* tags (plain VERB)
    "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
    "VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
    "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
    "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
    "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},

    # everything else
    "XY": {"pos": "X"},
    "SP": {"pos": "SPACE"},
}