Remove redundant language_data.py files in languages

Originally intended to collect all the components of a language in one place, but it just made things messy. Now each component is in charge of exporting itself properly.

parent a627d3e3b0
commit 24606d364c
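For context, a minimal sketch of the direction this commit moves in, assuming the usual per-language package layout (the module roles and example data below are illustrative, not taken from the diff): each data module exports its own finished structure, and the language package simply re-exports those components instead of merging them in a language_data.py.

    # coding: utf8
    from __future__ import unicode_literals

    from spacy.symbols import ORTH, LEMMA

    # stop_words.py -- owns and exports its final data structure
    STOP_WORDS = set("""
    a about after again all am an and any are
    """.split())

    # tokenizer_exceptions.py -- likewise exports a ready-to-use dict,
    # keyed by the surface form of each exception (illustrative entries)
    TOKENIZER_EXCEPTIONS = {
        "am": [{ORTH: "am", LEMMA: "a.m."}],
        "pm": [{ORTH: "pm", LEMMA: "p.m."}],
    }

    # __init__.py of the language package -- no merging step, just re-export
    __all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]

The repeated update_exc/strings_to_exc plumbing in the deleted files below is exactly the per-language glue the commit message calls messy.
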
@@ -1,27 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.language_data import strings_to_exc, update_exc
-from .punctuation import *
-from .stop_words import STOP_WORDS
-from .tag_map import TAG_MAP as TAG_MAP_BN
-from .morph_rules import MORPH_RULES
-from .lemma_rules import LEMMA_RULES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
-from .. import language_data as base
-
-STOP_WORDS = set(STOP_WORDS)
-
-TAG_MAP = base.TAG_MAP
-TAG_MAP.update(TAG_MAP_BN)
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
-
-TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
-           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]

@@ -1,29 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc, expand_exc
-from ..symbols import ORTH, LEMMA
-
-from .tag_map import TAG_MAP
-from .word_sets import STOP_WORDS, NUM_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-from .morph_rules import MORPH_RULES
-from .lemmatizer import RULES as LEMMA_RULES
-from .lemmatizer import INDEX as LEMMA_INDEX
-from .lemmatizer import EXC as LEMMA_EXC
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES",
-           "LEMMA_RULES", "LEMMA_INDEX", "LEMMA_EXC"]

@@ -1,55 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-from ..symbols import ORTH, LEMMA
-
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-def get_time_exc(hours):
-    exc = {
-        "12m.": [
-            {ORTH: "12"},
-            {ORTH: "m.", LEMMA: "p.m."}
-        ]
-    }
-
-    for hour in hours:
-        exc["%sa.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "a.m."}
-        ]
-
-        exc["%sp.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "p.m."}
-        ]
-
-        exc["%sam" % hour] = [
-            {ORTH: hour},
-            {ORTH: "am", LEMMA: "a.m."}
-        ]
-
-        exc["%spm" % hour] = [
-            {ORTH: hour},
-            {ORTH: "pm", LEMMA: "p.m."}
-        ]
-    return exc
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
-    ['%d' % hour for hour in range(1, 12 + 1)]))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]

@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,11 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-__all__ = ["STOP_WORDS", "get_tokenizer_exceptions", "TOKEN_MATCH"]

@@ -1,17 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.language_data import strings_to_exc, update_exc
-from .punctuation import *
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import ABBREVIATIONS
-from .tokenizer_exceptions import OTHER_EXC
-from .. import language_data as base
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
-
-TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,23 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
-# import base language data
-from .. import language_data as base
-
-
-# import util functions
-from ..language_data import update_exc, strings_to_exc
-
-
-# import language-specific data from files
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-# export
-__all__ = ["TAG_MAP", "STOP_WORDS"]

@@ -1,28 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
-# import base language data
-from .. import language_data as base
-
-
-# import util functions
-from ..language_data import update_exc, strings_to_exc, expand_exc
-
-# import language-specific data from files
-#from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-from .morph_rules import MORPH_RULES
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-#TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-# customize tokenizer exceptions
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-# export
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"]

@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]