spaCy/spacy/en/language_data.py
ines f9e603903b Rename stop_words.py to word_sets.py and include more sets
NUM_WORDS and ORDINAL_WORDS are currently not used, but the hard-coded
lists should be removed from orth.pyx and replaced with
language-specific functions. This will later allow other languages to
use their own functions to set those flags. (In English this is easy,
because a token only needs to be checked against a set; in German, for
example, it requires a more complex function, because most number words
are written as a single compound word. A sketch of the difference
follows below.)
2017-03-12 13:58:22 +01:00
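To make the distinction in the commit message concrete, here is a rough sketch of the two kinds of number-word checks it describes. Everything in it is hypothetical and illustrative (the function names, the word lists, the greedy decomposition); it is not spaCy's implementation:

EN_NUM_WORDS = set("zero one two three four five six seven eight nine ten".split())

def en_is_num_word(text):
    # English: number words are separate tokens ("three hundred
    # twenty five"), so a plain set lookup is enough.
    return text.lower() in EN_NUM_WORDS

# German number components, longest first so that e.g. "zwanzig" is
# tried before "zwei". The list is deliberately incomplete.
DE_NUM_PARTS = sorted(
    ["und", "ein", "eins", "zwei", "drei", "vier", "fünf", "sechs",
     "sieben", "acht", "neun", "zehn", "elf", "zwölf", "zwanzig",
     "dreißig", "vierzig", "fünfzig", "sechzig", "siebzig", "achtzig",
     "neunzig", "hundert", "tausend"],
    key=len, reverse=True)

def de_is_num_word(text):
    # German: most number words are a single compound token
    # ("dreihundertfünfundzwanzig" = 325), so no finite set of full
    # forms covers them; greedily consume known components instead.
    text = text.lower()
    while text:
        for part in DE_NUM_PARTS:
            if text.startswith(part):
                text = text[len(part):]
                break
        else:
            return False
    return True

Here de_is_num_word("dreihundertfünfundzwanzig") returns True even though the full form appears in no list, which is the extra complexity the commit message alludes to.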

# coding: utf8
from __future__ import unicode_literals

from .. import language_data as base
from ..language_data import update_exc, strings_to_exc, expand_exc
from ..symbols import ORTH, LEMMA

from .tag_map import TAG_MAP
from .word_sets import STOP_WORDS, NUM_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
from .morph_rules import MORPH_RULES


# Take mutable copies of the imported data, so the updates below
# don't modify the originals.
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)

# Add the orthography-only exceptions, curly-apostrophe (U+2019)
# variants of all exceptions containing a straight apostrophe, and the
# shared emoticons and abbreviations to the tokenizer exceptions.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))


__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES"]
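For context, a minimal sketch of what the three helper calls above accomplish, using the ORTH symbol imported at the top of the file. These are simplified stand-ins written for this explanation, not the actual definitions from spacy/language_data, and they assume every exception entry carries an ORTH value:

def strings_to_exc(orths):
    # Turn each plain string into a one-token exception that keeps
    # its original orthography.
    return {orth: [{ORTH: orth}] for orth in orths}

def update_exc(exc, additions):
    # Merge the new exceptions into the existing dict in place.
    exc.update(additions)

def expand_exc(exc, search, replace):
    # For every exception whose key contains `search` (here the
    # straight apostrophe), build a variant keyed and spelled with
    # `replace` (the curly apostrophe) instead.
    expanded = {}
    for token_string, substrings in exc.items():
        if search in token_string:
            new_substrings = [dict(attrs) for attrs in substrings]
            for attrs in new_substrings:
                attrs[ORTH] = attrs[ORTH].replace(search, replace)
            expanded[token_string.replace(search, replace)] = new_substrings
    return expanded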