Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
Remove redundant language_data.py files in languages
These files were originally intended to collect all of a language's components in one place, but they just made things messy. Now each component is in charge of exporting itself properly.
This commit is contained in:
parent a627d3e3b0
commit 24606d364c
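For context, the layout this commit moves toward keeps every piece of language data in its own module and has the language package import those modules directly, rather than funnelling them through a language_data.py aggregator. A minimal sketch of that pattern for a hypothetical language package xx (file names and contents below are illustrative, not taken from this diff):

# xx/stop_words.py: the component owns and exports its own data
STOP_WORDS = set("a an the of to".split())


# xx/__init__.py: the language imports each component directly,
# with no intermediate language_data.py in between
from .stop_words import STOP_WORDS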
@@ -1,27 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.language_data import strings_to_exc, update_exc
from .punctuation import *
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP as TAG_MAP_BN
from .morph_rules import MORPH_RULES
from .lemma_rules import LEMMA_RULES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
from .. import language_data as base

STOP_WORDS = set(STOP_WORDS)

TAG_MAP = base.TAG_MAP
TAG_MAP.update(TAG_MAP_BN)

TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)

TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES

__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
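Every one of these deleted files relies on the same two helpers from spacy.language_data: strings_to_exc, which turns a sequence of strings into a tokenizer-exception dict, and update_exc, which merges one exception dict into another. For readers following the hunks below, they behave roughly like this sketch (an approximation, not the exact library source):

from spacy.symbols import ORTH

def strings_to_exc(orths):
    # one single-token special case per string, keyed by the string itself
    return {orth: [{ORTH: orth}] for orth in orths}

def update_exc(exceptions, additions):
    # merge the new special cases into the existing dict in place
    exceptions.update(additions)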
@@ -1,22 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
@@ -1,29 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc, expand_exc
from ..symbols import ORTH, LEMMA

from .tag_map import TAG_MAP
from .word_sets import STOP_WORDS, NUM_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
from .morph_rules import MORPH_RULES
from .lemmatizer import RULES as LEMMA_RULES
from .lemmatizer import INDEX as LEMMA_INDEX
from .lemmatizer import EXC as LEMMA_EXC


TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES",
           "LEMMA_RULES", "LEMMA_INDEX", "LEMMA_EXC"]
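The hunk above (like one further down) also calls expand_exc(TOKENIZER_EXCEPTIONS, "'", "’") so that every exception written with a straight apostrophe gets a twin spelled with the typographic one. Roughly, that helper does something like the following (an approximation for illustration, not the library source):

from spacy.symbols import ORTH

def expand_exc(exceptions, search, replace):
    # for every key containing `search`, emit a copy respelled with `replace`,
    # e.g. "don't" also gets a "don’t" entry
    extra = {}
    for token_string, tokens in exceptions.items():
        if search in token_string:
            new_tokens = [dict(token) for token in tokens]
            for token in new_tokens:
                token[ORTH] = token[ORTH].replace(search, replace)
            extra[token_string.replace(search, replace)] = new_tokens
    return extra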
@@ -1,55 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from ..symbols import ORTH, LEMMA

from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


def get_time_exc(hours):
    exc = {
        "12m.": [
            {ORTH: "12"},
            {ORTH: "m.", LEMMA: "p.m."}
        ]
    }

    for hour in hours:
        exc["%sa.m." % hour] = [
            {ORTH: hour},
            {ORTH: "a.m."}
        ]

        exc["%sp.m." % hour] = [
            {ORTH: hour},
            {ORTH: "p.m."}
        ]

        exc["%sam" % hour] = [
            {ORTH: hour},
            {ORTH: "am", LEMMA: "a.m."}
        ]

        exc["%spm" % hour] = [
            {ORTH: hour},
            {ORTH: "pm", LEMMA: "p.m."}
        ]
    return exc

TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
    ['%d' % hour for hour in range(1, 12 + 1)]))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
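The get_time_exc helper deleted above builds the clock-time special cases for the tokenizer: the fixed "12m." entry plus, for every hour string passed in, entries for the "a.m.", "p.m.", "am" and "pm" spellings that split into a number token and a normalized meridiem token. Reading the code as written, a call would produce entries like these:

exc = get_time_exc(['3'])
# exc["3pm"]   == [{ORTH: '3'}, {ORTH: 'pm', LEMMA: 'p.m.'}]
# exc["3a.m."] == [{ORTH: '3'}, {ORTH: 'a.m.'}]
# exc["12m."]  == [{ORTH: '12'}, {ORTH: 'm.', LEMMA: 'p.m.'}]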
@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


STOP_WORDS = set(STOP_WORDS)

TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
@@ -1,11 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH


STOP_WORDS = set(STOP_WORDS)


__all__ = ["STOP_WORDS", "get_tokenizer_exceptions", "TOKEN_MATCH"]
@@ -1,17 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS


STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
@@ -1,22 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.language_data import strings_to_exc, update_exc
from .punctuation import *
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import ABBREVIATIONS
from .tokenizer_exceptions import OTHER_EXC
from .. import language_data as base

STOP_WORDS = set(STOP_WORDS)

TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))

TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
TOKENIZER_INFIXES = TOKENIZER_INFIXES

__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS


STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
@@ -1,23 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals


# import base language data
from .. import language_data as base


# import util functions
from ..language_data import update_exc, strings_to_exc


# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS


TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


# export
__all__ = ["TAG_MAP", "STOP_WORDS"]
@@ -1,28 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals


# import base language data
from .. import language_data as base


# import util functions
from ..language_data import update_exc, strings_to_exc, expand_exc

# import language-specific data from files
#from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
from .morph_rules import MORPH_RULES

TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
#TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)

# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))

# export
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"]
@@ -1,17 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS


STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
@@ -1,19 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY

STOP_WORDS = set(STOP_WORDS)


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
@@ -1,19 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc

from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


STOP_WORDS = set(STOP_WORDS)

TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]