2016-11-02 22:02:41 +03:00
|
|
|
|
# encoding: utf8
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stop-word collection for this language-data module; defined as an empty set
# here (no stop words are listed in this file).
STOP_WORDS = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Literal strings recognised at the start of a token, one per line of the raw
# block below. Every entry is passed through re.escape, so the resulting items
# are regex-safe fragments rather than the raw characters.
#
# The result is materialised with list(): on Python 3 a bare map() is a
# one-shot iterator, which would leave the prefixes empty after the first
# consumer iterated over them. On Python 2, map() already returns a list and
# the extra list() call is a cheap copy.
TOKENIZER_PREFIXES = list(map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
„
“
'
``
`
#
US$
C$
A$
a-
‘
....
...
‚
»
_
§
'''.strip().split('\n')))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Regex fragments recognised at the end of a token, one pattern per line of the
# raw block below. Unlike the prefixes, these entries are NOT passed through
# re.escape: they are written as regular expressions directly (note the
# explicit backslash escapes and the look-behind assertions).
#
# The `(?<=[0-9])<unit>` entries only match a unit string that directly follows
# a digit; the long look-behind before `\.` restricts a trailing period to
# positions after a lowercase letter / umlaut or one of the listed closing
# punctuation characters.
#
# No line may consist of a bare `|`: as a regex that is an empty alternation,
# which matches the empty string.
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
”
“
«
_
''
's
'S
’s
’S
’
‘
°
€
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
|
|
|
|
|
|
|
|
|
|
|
2016-11-02 22:43:12 +03:00
|
|
|
|
# Regex fragments for separators found in the middle of a token. The patterns
# are written space-separated across adjacent raw-string literals and split on
# whitespace, so no individual pattern may contain a space.
#
# In order: three or more dots; a period between a lowercase and an uppercase
# letter; a hyphen between two ASCII letters; a double hyphen between two ASCII
# letters; a hyphen between two digits; a comma between two ASCII letters.
#
# The letter look-aheads use [a-zA-Z]. The range `A-z` must not be used there:
# it also spans the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
TOKENIZER_INFIXES = (
    r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
    r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
    r'''(?<=[A-Za-z]),(?=[A-Za-z])'''
).split()
|
|
|
|
|
|
2016-11-02 22:02:41 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Special-case tokenizations, keyed by the exact surface string.
#
# Each value is a list of per-token attribute dicts, in order. "F" holds the
# text of that token; concatenating the "F" values of an entry reproduces its
# key. "L" and "pos" appear only on some tokens.
# NOTE(review): "L" looks like a lemma/normalised form and "pos" like a
# fine-grained tag (its only value here, "$,", is also a TAG_MAP key) --
# confirm against the code that consumes this table.
TOKENIZER_EXCEPTIONS = {
    # Miscellaneous.
    "vs.": [{"F": "vs."}],

    "''": [{"F": "''"}],
    "—": [{"F": "—", "L": "--", "pos": "$,"}],

    # Time-of-day markers kept as a single token.
    "a.m.": [{"F": "a.m."}],
    "p.m.": [{"F": "p.m."}],

    # Hour glued to "a.m." / "am": split into the number and the marker.
    "1a.m.": [{"F": "1"}, {"F": "a.m."}],
    "2a.m.": [{"F": "2"}, {"F": "a.m."}],
    "3a.m.": [{"F": "3"}, {"F": "a.m."}],
    "4a.m.": [{"F": "4"}, {"F": "a.m."}],
    "5a.m.": [{"F": "5"}, {"F": "a.m."}],
    "6a.m.": [{"F": "6"}, {"F": "a.m."}],
    "7a.m.": [{"F": "7"}, {"F": "a.m."}],
    "8a.m.": [{"F": "8"}, {"F": "a.m."}],
    "9a.m.": [{"F": "9"}, {"F": "a.m."}],
    "10a.m.": [{"F": "10"}, {"F": "a.m."}],
    "11a.m.": [{"F": "11"}, {"F": "a.m."}],
    "12a.m.": [{"F": "12"}, {"F": "a.m."}],
    "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
    "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
    "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
    "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
    "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
    "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
    "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
    "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
    "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
    "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
    "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
    "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],

    # Hour glued to "p.m." / "pm": split into the number and the marker.
    "1p.m.": [{"F": "1"}, {"F": "p.m."}],
    "2p.m.": [{"F": "2"}, {"F": "p.m."}],
    "3p.m.": [{"F": "3"}, {"F": "p.m."}],
    "4p.m.": [{"F": "4"}, {"F": "p.m."}],
    "5p.m.": [{"F": "5"}, {"F": "p.m."}],
    "6p.m.": [{"F": "6"}, {"F": "p.m."}],
    "7p.m.": [{"F": "7"}, {"F": "p.m."}],
    "8p.m.": [{"F": "8"}, {"F": "p.m."}],
    "9p.m.": [{"F": "9"}, {"F": "p.m."}],
    "10p.m.": [{"F": "10"}, {"F": "p.m."}],
    "11p.m.": [{"F": "11"}, {"F": "p.m."}],
    "12p.m.": [{"F": "12"}, {"F": "p.m."}],
    "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
    "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
    "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
    "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
    "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
    "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
    "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
    "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
    "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
    "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
    "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
    "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],

    # Place-name abbreviations ending in a period, kept as a single token.
    "Ala.": [{"F": "Ala."}],
    "Ariz.": [{"F": "Ariz."}],
    "Ark.": [{"F": "Ark."}],
    "Calif.": [{"F": "Calif."}],
    "Colo.": [{"F": "Colo."}],
    "Conn.": [{"F": "Conn."}],
    "Del.": [{"F": "Del."}],
    "D.C.": [{"F": "D.C."}],
    "Fla.": [{"F": "Fla."}],
    "Ga.": [{"F": "Ga."}],
    "Ill.": [{"F": "Ill."}],
    "Ind.": [{"F": "Ind."}],
    "Kans.": [{"F": "Kans."}],
    "Kan.": [{"F": "Kan."}],
    "Ky.": [{"F": "Ky."}],
    "La.": [{"F": "La."}],
    "Md.": [{"F": "Md."}],
    "Mass.": [{"F": "Mass."}],
    "Mich.": [{"F": "Mich."}],
    "Minn.": [{"F": "Minn."}],
    "Miss.": [{"F": "Miss."}],
    "Mo.": [{"F": "Mo."}],
    "Mont.": [{"F": "Mont."}],
    "Nebr.": [{"F": "Nebr."}],
    "Neb.": [{"F": "Neb."}],
    "Nev.": [{"F": "Nev."}],
    "N.H.": [{"F": "N.H."}],
    "N.J.": [{"F": "N.J."}],
    "N.M.": [{"F": "N.M."}],
    "N.Y.": [{"F": "N.Y."}],
    "N.C.": [{"F": "N.C."}],
    "N.D.": [{"F": "N.D."}],
    "Okla.": [{"F": "Okla."}],
    "Ore.": [{"F": "Ore."}],
    "Pa.": [{"F": "Pa."}],
    "Tenn.": [{"F": "Tenn."}],
    "Va.": [{"F": "Va."}],
    "Wash.": [{"F": "Wash."}],
    "Wis.": [{"F": "Wis."}],

    # Emoticons, kept as a single token.
    ":)": [{"F": ":)"}],
    "<3": [{"F": "<3"}],
    ";)": [{"F": ";)"}],
    "(:": [{"F": "(:"}],
    ":(": [{"F": ":("}],
    "-_-": [{"F": "-_-"}],
    "=)": [{"F": "=)"}],
    ":/": [{"F": ":/"}],
    ":>": [{"F": ":>"}],
    ";-)": [{"F": ";-)"}],
    ":Y": [{"F": ":Y"}],
    ":P": [{"F": ":P"}],
    ":-P": [{"F": ":-P"}],
    ":3": [{"F": ":3"}],
    "=3": [{"F": "=3"}],
    "xD": [{"F": "xD"}],
    "^_^": [{"F": "^_^"}],
    "=]": [{"F": "=]"}],
    "=D": [{"F": "=D"}],
    "<333": [{"F": "<333"}],
    ":))": [{"F": ":))"}],
    ":0": [{"F": ":0"}],
    "-__-": [{"F": "-__-"}],
    "xDD": [{"F": "xDD"}],
    "o_o": [{"F": "o_o"}],
    "o_O": [{"F": "o_O"}],
    "V_V": [{"F": "V_V"}],
    "=[[": [{"F": "=[["}],
    "<33": [{"F": "<33"}],
    ";p": [{"F": ";p"}],
    ";D": [{"F": ";D"}],
    ";-p": [{"F": ";-p"}],
    ";(": [{"F": ";("}],
    ":p": [{"F": ":p"}],
    ":]": [{"F": ":]"}],
    ":O": [{"F": ":O"}],
    ":-/": [{"F": ":-/"}],
    ":-)": [{"F": ":-)"}],
    ":(((": [{"F": ":((("}],
    ":((": [{"F": ":(("}],
    ":')": [{"F": ":')"}],
    "(^_^)": [{"F": "(^_^)"}],
    "(=": [{"F": "(="}],
    "o.O": [{"F": "o.O"}],
    "\")": [{"F": "\")"}],

    # Single lowercase letter followed by a period, kept as a single token.
    "a.": [{"F": "a."}],
    "b.": [{"F": "b."}],
    "c.": [{"F": "c."}],
    "d.": [{"F": "d."}],
    "e.": [{"F": "e."}],
    "f.": [{"F": "f."}],
    "g.": [{"F": "g."}],
    "h.": [{"F": "h."}],
    "i.": [{"F": "i."}],
    "j.": [{"F": "j."}],
    "k.": [{"F": "k."}],
    "l.": [{"F": "l."}],
    "m.": [{"F": "m."}],
    "n.": [{"F": "n."}],
    "o.": [{"F": "o."}],
    "p.": [{"F": "p."}],
    "q.": [{"F": "q."}],
    "r.": [{"F": "r."}],
    "s.": [{"F": "s."}],
    "t.": [{"F": "t."}],
    "u.": [{"F": "u."}],
    "v.": [{"F": "v."}],
    "w.": [{"F": "w."}],
    "x.": [{"F": "x."}],
    "y.": [{"F": "y."}],
    "z.": [{"F": "z."}],
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps each fine-grained tag string to a dict holding a coarse "pos" value
# plus, for some tags, additional feature/value pairs (e.g. "PronType",
# "VerbForm", "Mood").
# NOTE(review): the keys look like the German STTS tagset and the values like
# Universal Dependencies POS tags and features -- confirm against the tagger
# that consumes this table.
TAG_MAP = {
    # Punctuation.
    "$(": {"pos": "PUNCT", "PunctType": "Brck"},
    "$,": {"pos": "PUNCT", "PunctType": "Comm"},
    "$.": {"pos": "PUNCT", "PunctType": "Peri"},

    # Adjectives and adverbs.
    "ADJA": {"pos": "ADJ"},
    "ADJD": {"pos": "ADJ", "Variant": "Short"},
    "ADV": {"pos": "ADV"},

    # Adpositions.
    "APPO": {"pos": "ADP", "AdpType": "Post"},
    "APPR": {"pos": "ADP", "AdpType": "Prep"},
    "APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
    "APZR": {"pos": "ADP", "AdpType": "Circ"},

    # Articles, numbers, foreign material, interjections.
    "ART": {"pos": "DET", "PronType": "Art"},
    "CARD": {"pos": "NUM", "NumType": "Card"},
    "FM": {"pos": "X", "Foreign": "Yes"},
    "ITJ": {"pos": "INTJ"},

    # Conjunctions.
    "KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
    "KON": {"pos": "CONJ"},
    "KOUI": {"pos": "SCONJ"},
    "KOUS": {"pos": "SCONJ"},

    # Nouns.
    "NE": {"pos": "PROPN"},
    "NNE": {"pos": "PROPN"},
    "NN": {"pos": "NOUN"},

    # Pronouns, determiners and pronominal adverbs.
    "PAV": {"pos": "ADV", "PronType": "Dem"},
    "PROAV": {"pos": "ADV", "PronType": "Dem"},
    "PDAT": {"pos": "DET", "PronType": "Dem"},
    "PDS": {"pos": "PRON", "PronType": "Dem"},
    "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
    "PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
    "PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
    "PPER": {"pos": "PRON", "PronType": "Prs"},
    "PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
    "PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
    "PRELAT": {"pos": "DET", "PronType": "Rel"},
    "PRELS": {"pos": "PRON", "PronType": "Rel"},
    "PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},

    # Particles.
    "PTKA": {"pos": "PART"},
    "PTKANT": {"pos": "PART", "PartType": "Res"},
    "PTKNEG": {"pos": "PART", "Negative": "Neg"},
    "PTKVZ": {"pos": "PART", "PartType": "Vbp"},
    "PTKZU": {"pos": "PART", "PartType": "Inf"},

    # Interrogatives.
    "PWAT": {"pos": "DET", "PronType": "Int"},
    "PWAV": {"pos": "ADV", "PronType": "Int"},
    "PWS": {"pos": "PRON", "PronType": "Int"},

    # Truncated word parts.
    "TRUNC": {"pos": "X", "Hyph": "Yes"},

    # Auxiliary verbs.
    "VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
    "VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
    "VAINF": {"pos": "AUX", "VerbForm": "Inf"},
    "VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},

    # Modal verbs.
    "VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
    "VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
    "VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},

    # Full verbs.
    "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
    "VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
    "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
    "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
    "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},

    # Other.
    "XY": {"pos": "X"},
    "SP": {"pos": "SPACE"},
}
|