diff --git a/spacy/bn/language_data.py b/spacy/bn/language_data.py
deleted file mode 100644
index 516d6a75b..000000000
--- a/spacy/bn/language_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.language_data import strings_to_exc, update_exc
-from .punctuation import *
-from .stop_words import STOP_WORDS
-from .tag_map import TAG_MAP as TAG_MAP_BN
-from .morph_rules import MORPH_RULES
-from .lemma_rules import LEMMA_RULES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS as TOKENIZER_EXCEPTIONS_BN
-from .. import language_data as base
-
-STOP_WORDS = set(STOP_WORDS)
-
-TAG_MAP = base.TAG_MAP
-TAG_MAP.update(TAG_MAP_BN)
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-TOKENIZER_EXCEPTIONS.update(TOKENIZER_EXCEPTIONS_BN)
-
-TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TAG_MAP", "MORPH_RULES", "LEMMA_RULES",
-           "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py
deleted file mode 100644
index 83894e553..000000000
--- a/spacy/de/language_data.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
deleted file mode 100644
index f53e6beda..000000000
--- a/spacy/en/language_data.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc, expand_exc
-from ..symbols import ORTH, LEMMA
-
-from .tag_map import TAG_MAP
-from .word_sets import STOP_WORDS, NUM_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-from .morph_rules import MORPH_RULES
-from .lemmatizer import RULES as LEMMA_RULES
-from .lemmatizer import INDEX as LEMMA_INDEX
-from .lemmatizer import EXC as LEMMA_EXC
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES",
-           "LEMMA_RULES", "LEMMA_INDEX", "LEMMA_EXC"]
diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py
deleted file mode 100644
index 1758efefa..000000000
--- a/spacy/es/language_data.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-from ..symbols import ORTH, LEMMA
-
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-def get_time_exc(hours):
-    exc = {
-        "12m.": [
-            {ORTH: "12"},
-            {ORTH: "m.", LEMMA: "p.m."}
-        ]
-    }
-
-    for hour in hours:
-        exc["%sa.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "a.m."}
-        ]
-
-        exc["%sp.m." % hour] = [
-            {ORTH: hour},
-            {ORTH: "p.m."}
-        ]
-
-        exc["%sam" % hour] = [
-            {ORTH: hour},
-            {ORTH: "am", LEMMA: "a.m."}
-        ]
-
-        exc["%spm" % hour] = [
-            {ORTH: hour},
-            {ORTH: "pm", LEMMA: "p.m."}
-        ]
-    return exc
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(
-    ['%d' % hour for hour in range(1, 12 + 1)]))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
diff --git a/spacy/fi/language_data.py b/spacy/fi/language_data.py
deleted file mode 100644
index 74f137631..000000000
--- a/spacy/fi/language_data.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py
deleted file mode 100644
index e199fad60..000000000
--- a/spacy/fr/language_data.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import get_tokenizer_exceptions, TOKEN_MATCH
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-__all__ = ["STOP_WORDS", "get_tokenizer_exceptions", "TOKEN_MATCH"]
diff --git a/spacy/he/language_data.py b/spacy/he/language_data.py
deleted file mode 100644
index a4a657c33..000000000
--- a/spacy/he/language_data.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
deleted file mode 100644
index e888c677a..000000000
--- a/spacy/hu/language_data.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.language_data import strings_to_exc, update_exc
-from .punctuation import *
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import ABBREVIATIONS
-from .tokenizer_exceptions import OTHER_EXC
-from .. import language_data as base
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
-
-TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py
deleted file mode 100644
index f9899d8d1..000000000
--- a/spacy/it/language_data.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
deleted file mode 100644
index 007ed2b4e..000000000
--- a/spacy/ja/language_data.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
-# import base language data
-from .. import language_data as base
-
-
-# import util functions
-from ..language_data import update_exc, strings_to_exc
-
-
-# import language-specific data from files
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-
-
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-
-# export
-__all__ = ["TAG_MAP", "STOP_WORDS"]
diff --git a/spacy/nb/language_data.py b/spacy/nb/language_data.py
deleted file mode 100644
index 248b09fc7..000000000
--- a/spacy/nb/language_data.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
-# import base language data
-from .. import language_data as base
-
-
-# import util functions
-from ..language_data import update_exc, strings_to_exc, expand_exc
-
-# import language-specific data from files
-#from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-from .morph_rules import MORPH_RULES
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-#TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-# customize tokenizer exceptions
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-# export
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"]
diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py
deleted file mode 100644
index f9899d8d1..000000000
--- a/spacy/nl/language_data.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py
deleted file mode 100644
index d96cdd38f..000000000
--- a/spacy/pt/language_data.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-STOP_WORDS = set(STOP_WORDS)
-
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py
deleted file mode 100644
index b8529724a..000000000
--- a/spacy/sv/language_data.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-
-STOP_WORDS = set(STOP_WORDS)
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-
-
-__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]