Remove redundant language_data.py files in languages

These files were originally intended to collect all components of a
language, but they just made things messy. Now each component is in
charge of exporting itself properly.
ines 2017-05-08 15:55:29 +02:00
parent a627d3e3b0
commit 24606d364c
14 changed files with 0 additions and 323 deletions
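For context, the pattern replacing these aggregator files is roughly the one sketched below: each data module exports its own constants, and the language package imports them directly rather than routing everything through a language_data.py. This is only an illustrative sketch for a hypothetical language package; the module and attribute names follow the conventions visible in the deleted files, and the exact post-refactor layout is not shown in this commit.

# coding: utf8
# stop_words.py -- the component module owns and exports its own data
from __future__ import unicode_literals

STOP_WORDS = set("""
a an and as at by for from in of on or the to with
""".split())

# __init__.py of the same (hypothetical) language package -- imports each
# component directly, with no intermediate language_data.py aggregator
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]

Keeping each export next to its data means a language package can add or drop a component without touching a shared aggregator module.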


@@ -1,27 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.language_data import strings_to_exc, update_exc
from .punctuation import *
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP as TAG_MAP_BN
from .morph_rules import MORPH_RULES
from .lemma_rules import LEMMA_RULES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
from .. import language_data as base
STOP_WORDS = set(STOP_WORDS)
TAG_MAP = base.TAG_MAP
TAG_MAP.update(TAG_MAP_BN)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
"TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]


@@ -1,22 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]


@@ -1,29 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc, expand_exc
from ..symbols import ORTH, LEMMA
from .tag_map import TAG_MAP
from .word_sets import STOP_WORDS, NUM_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
from .morph_rules import MORPH_RULES
from .lemmatizer import RULES as LEMMA_RULES
from .lemmatizer import INDEX as LEMMA_INDEX
from .lemmatizer import EXC as LEMMA_EXC
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES",
"LEMMA_RULES", "LEMMA_INDEX", "LEMMA_EXC"]


@@ -1,55 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from ..symbols import ORTH, LEMMA
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
def get_time_exc(hours):
exc = {
"12m.": [
{ORTH: "12"},
{ORTH: "m.", LEMMA: "p.m."}
]
}
for hour in hours:
exc["%sa.m." % hour] = [
{ORTH: hour},
{ORTH: "a.m."}
]
exc["%sp.m." % hour] = [
{ORTH: hour},
{ORTH: "p.m."}
]
exc["%sam" % hour] = [
{ORTH: hour},
{ORTH: "am", LEMMA: "a.m."}
]
exc["%spm" % hour] = [
{ORTH: hour},
{ORTH: "pm", LEMMA: "p.m."}
]
return exc
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
['%d' % hour for hour in range(1, 12 + 1)]))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]


@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@@ -1,11 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH
STOP_WORDS = set(STOP_WORDS)
__all__ = ["STOP_WORDS", "get_tokenizer_exceptions", "TOKEN_MATCH"]


@@ -1,17 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@@ -1,22 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.language_data import strings_to_exc, update_exc
from .punctuation import *
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import ABBREVIATIONS
from .tokenizer_exceptions import OTHER_EXC
from .. import language_data as base
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]


@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@@ -1,23 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
# import base language data
from .. import language_data as base
# import util functions
from ..language_data import update_exc, strings_to_exc
# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
# export
__all__ = ["TAG_MAP", "STOP_WORDS"]


@@ -1,28 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
# import base language data
from .. import language_data as base
# import util functions
from ..language_data import update_exc, strings_to_exc, expand_exc
# import language-specific data from files
#from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
from .morph_rules import MORPH_RULES
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
#TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
# export
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"]


@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@@ -1,19 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]


@@ -1,19 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]