mirror of
https://github.com/explosion/spaCy.git
synced 2024-09-24 04:49:11 +03:00
249 lines
6.6 KiB
Python
249 lines
6.6 KiB
Python
# encoding: utf8
|
||
from __future__ import unicode_literals
|
||
|
||
import io
import os
import re

import six
|
||
|
||
|
||
def _load_txt_data(*file_paths):
    """Yield the stripped, non-comment lines of one or more text files.

    Files are read in the order given. Each line is stripped of surrounding
    whitespace; lines that are then empty, or that start with ``#``, are
    skipped, so an empty string is never yielded.

    The files are decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding (the data files are expected to contain
    non-ASCII Hungarian text). ``io.open`` behaves the same on Python 2
    and Python 3 and always produces unicode strings.

    :param file_paths: paths of the text files to read.
    :returns: a generator of unicode strings, one per retained line.
    """
    for path in file_paths:
        with io.open(path, encoding="utf8") as f:
            # Iterate the file object directly; readlines() would load the
            # whole file into memory first.
            for line in f:
                entry = line.strip()
                if entry and not entry.startswith("#"):
                    yield entry
|
||
|
||
|
||
# Directory that contains this module; the data files are located beneath it.
_MODULE_PATH = os.path.dirname(__file__)

# Build the paths with os.path.join rather than string concatenation: when
# dirname(__file__) is the empty string, "" + "/data/..." would point at the
# filesystem root instead of a path relative to the current directory.
_ABBREVIATIONS_ORIG_PATH = os.path.join(
    _MODULE_PATH, "data", "tokenizer", "abbreviations_orig-hu.txt")
_ABBREVIATIONS_NYTUD_PATH = os.path.join(
    _MODULE_PATH, "data", "tokenizer", "abbreviations_nytud-hu.txt")
_STOPWORDS_PATH = os.path.join(_MODULE_PATH, "data", "stopwords.txt")
|
||
|
||
# Stop-word set, built once at import time from the lines that
# _load_txt_data yields for the stop-word file.
STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
|
||
|
||
# Hyphen- and dash-like characters, given as Unicode code points and
# converted to one-character unicode strings.
HYPHENS = [
    six.unichr(code_point)
    for code_point in (
        0x00AD,  # soft hyphen
        0x2013,  # en dash
        0x2014,  # em dash
        0x2015,  # horizontal bar
        0x2212,  # minus sign
        0x2500,  # box drawings light horizontal
    )
]
|
||
|
||
# Prefix strings for the tokenizer, written out as an explicit list.
TOKENIZER_PREFIXES = [
    ",", '"', "(", "[", "{", "*", "<", "$", "£", "“", "'", "``", "`", "#",
    "US$", "C$", "A$", "a-", "‘", "....", "...",
]
|
||
|
||
# Suffix patterns for the tokenizer. Every entry is a regular-expression
# fragment, hence the raw strings and the escaped metacharacters.
TOKENIZER_SUFFIXES = [
    # Punctuation, closing brackets, quotes and symbols.
    r',', r'\"', r'\)', r'\]', r'\}', r'\*', r'\!', r'\?', r'%', r'\$',
    r'>', r':', r';', r"'", r'”', r'“', r'«', r'_', r"''", r'’', r'‘',
    r'°', r'€',
    # Runs of two, three and four dots.
    r'\.\.', r'\.\.\.', r'\.\.\.\.',
    # A single period, only when preceded by one of the listed characters.
    # NOTE(review): the class contains a bare "A" (not "A-Z") and has no
    # "ö"/"Ö" -- confirm whether that is intentional.
    r'''(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.''',
    r'\-\-',
    r'´',
] + list(
    # Unit abbreviations, each matched only directly after a digit.
    r'(?<=[0-9])' + unit
    for unit in (
        # area and volume
        'km²', 'm²', 'cm²', 'mm²', 'km³', 'm³', 'cm³', 'mm³', 'ha',
        # length
        'km', 'm', 'cm', 'mm', 'µm', 'nm', 'yd', 'in', 'ft',
        # mass
        'kg', 'g', 'mg', 'µg', 't', 'lb', 'oz',
        # speed
        'm/s', 'km/h', 'mph',
        # temperature
        '°C', '°K', '°F',
        # pressure
        'hPa', 'Pa', 'mbar', 'mb',
        # magnitude / size letters
        'T', 'G', 'M', 'K', 'kb',
    )
)
|
||
|
||
# Infix patterns (regular-expression fragments) for the tokenizer.
#
# The two hyphen patterns previously used the lookahead class [a-zA-z]. The
# range "A-z" spans every ASCII code point between "A" and "z", which also
# includes "[", "\", "]", "^", "_" and "`"; it is corrected to [a-zA-Z] so
# that the lookahead agrees with the lookbehind of the same pattern.
TOKENIZER_INFIXES = [
    # Two or more consecutive dots.
    r'\.\.+',
    # A period between a lowercase and an uppercase ASCII letter.
    r'(?<=[a-z])\.(?=[A-Z])',
    # A single or double hyphen between ASCII letters.
    r'(?<=[a-zA-Z])-(?=[a-zA-Z])',
    r'(?<=[a-zA-Z])--(?=[a-zA-Z])',
    # A hyphen between digits.
    r'(?<=[0-9])-(?=[0-9])',
    # A comma between ASCII letters.
    r'(?<=[A-Za-z]),(?=[A-Za-z])',
]
|
||
|
||
# One entry per line yielded from the two abbreviation files: the
# abbreviation maps to a single-token analysis whose "ORTH" is the
# abbreviation itself.
ABBREVIATIONS = {
    form: [{"ORTH": form}]
    for form in _load_txt_data(
        _ABBREVIATIONS_ORIG_PATH,
        _ABBREVIATIONS_NYTUD_PATH,
    )
}
|
||
|
||
# Special-case strings mapped to a list holding one token-attribute dict.
# NOTE(review): these entries use the "F" key while ABBREVIATIONS uses
# "ORTH" -- confirm that the consumer accepts both spellings.
TOKENIZER_EXCEPTIONS = {
    "vs.": [{"F": "vs."}],
    "''": [{"F": "''"}],
    # The only entry that carries more than the surface form.
    "—": [{"F": "—", "L": "--", "pos": "$,"}],
}

# Emoticons: each one maps to a single token with itself as "F".
TOKENIZER_EXCEPTIONS.update(
    (emoticon, [{"F": emoticon}])
    for emoticon in (
        ":)", "<3", ";)", "(:", ":(", "-_-", "=)", ":/", ":>", ";-)",
        ":Y", ":P", ":-P", ":3", "=3", "xD", "^_^", "=]", "=D", "<333",
        ":))", ":0", "-__-", "xDD", "o_o", "o_O", "V_V", "=[[", "<33",
        ";p", ";D", ";-p", ";(", ":p", ":]", ":O", ":-/", ":-)", ":(((",
        ":((", ":')", "(^_^)", "(=", "o.O", "\")",
    )
)

# Single lowercase ASCII letters followed by a period: "a." through "z.".
TOKENIZER_EXCEPTIONS.update(
    (letter + ".", [{"F": letter + "."}])
    for letter in "abcdefghijklmnopqrstuvwxyz"
)
|
||
|
||
# Merge the file-based abbreviations into the exception table. dict.update
# overwrites on a key clash, so an abbreviation entry replaces a hand-written
# exception with the same key.
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
||
|
||
# Maps each fine-grained tag to a dict holding a coarse "pos" value plus any
# additional morphological feature/value pairs for that tag.
# NOTE(review): the tag names (ADJA, APPR, KOUS, VVFIN, ...) look like the
# German STTS tagset rather than a Hungarian one -- confirm that this table
# is what the Hungarian pipeline is meant to use.
TAG_MAP = {
    # Punctuation
    "$(": {"pos": "PUNCT", "PunctType": "Brck"},
    "$,": {"pos": "PUNCT", "PunctType": "Comm"},
    "$.": {"pos": "PUNCT", "PunctType": "Peri"},
    # Adjectives and adverbs
    "ADJA": {"pos": "ADJ"},
    "ADJD": {"pos": "ADJ", "Variant": "Short"},
    "ADV": {"pos": "ADV"},
    # Adpositions
    "APPO": {"pos": "ADP", "AdpType": "Post"},
    "APPR": {"pos": "ADP", "AdpType": "Prep"},
    "APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
    "APZR": {"pos": "ADP", "AdpType": "Circ"},
    # Articles, numerals, foreign material, interjections
    "ART": {"pos": "DET", "PronType": "Art"},
    "CARD": {"pos": "NUM", "NumType": "Card"},
    "FM": {"pos": "X", "Foreign": "Yes"},
    "ITJ": {"pos": "INTJ"},
    # Conjunctions
    "KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
    "KON": {"pos": "CONJ"},
    "KOUI": {"pos": "SCONJ"},
    "KOUS": {"pos": "SCONJ"},
    # Nouns
    "NE": {"pos": "PROPN"},
    "NNE": {"pos": "PROPN"},
    "NN": {"pos": "NOUN"},
    # Pronouns, determiners and pronominal adverbs
    "PAV": {"pos": "ADV", "PronType": "Dem"},
    "PROAV": {"pos": "ADV", "PronType": "Dem"},
    "PDAT": {"pos": "DET", "PronType": "Dem"},
    "PDS": {"pos": "PRON", "PronType": "Dem"},
    "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
    "PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
    "PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
    "PPER": {"pos": "PRON", "PronType": "Prs"},
    "PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
    "PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
    "PRELAT": {"pos": "DET", "PronType": "Rel"},
    "PRELS": {"pos": "PRON", "PronType": "Rel"},
    "PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
    # Particles
    "PTKA": {"pos": "PART"},
    "PTKANT": {"pos": "PART", "PartType": "Res"},
    "PTKNEG": {"pos": "PART", "Negative": "Neg"},
    "PTKVZ": {"pos": "PART", "PartType": "Vbp"},
    "PTKZU": {"pos": "PART", "PartType": "Inf"},
    # Interrogative words
    "PWAT": {"pos": "DET", "PronType": "Int"},
    "PWAV": {"pos": "ADV", "PronType": "Int"},
    "PWS": {"pos": "PRON", "PronType": "Int"},
    "TRUNC": {"pos": "X", "Hyph": "Yes"},
    # Auxiliary verbs
    "VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
    "VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
    "VAINF": {"pos": "AUX", "VerbForm": "Inf"},
    "VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
    # Modal verbs (VerbType "Mod")
    "VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
    "VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
    "VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
    # Remaining verbs
    "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
    "VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
    "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
    "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
    "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
    # Catch-all and whitespace
    "XY": {"pos": "X"},
    "SP": {"pos": "SPACE"}
}
|