Merge branch 'french-tokenizer-exceptions'

This commit is contained in:
Matthew Honnibal 2017-02-27 11:44:39 +01:00
commit cc9b2b74e3

View File

@@ -8,7 +8,6 @@ from ..language_data.tokenizer_exceptions import _URL_PATTERN
from ..language_data.punctuation import ALPHA_LOWER
from .punctuation import ELISION, HYPHENS
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
from ..symbols import *
@@ -18,13 +17,9 @@ import re
def get_exceptions():
    """Return the base list of French tokenizer exception strings.

    The exceptions previously lived in an on-disk
    ``resources/tokenizer_exceptions`` file; they are now pre-built in the
    ``_tokenizer_exceptions_list`` module, so this simply returns that list.
    The redundant function-local import was removed — ``BASE_EXCEPTIONS`` is
    already imported at module level — and the dead commented-out
    file-reading code was deleted.
    """
    return BASE_EXCEPTIONS
def upper_first_letter(text):
if len(text) == 0:
@@ -217,6 +212,4 @@ REGULAR_EXP.append(_URL_PATTERN)
# Fold every exception pattern into one non-capturing alternation; matching
# is case-insensitive so exceptions apply regardless of capitalisation.
_combined_pattern = '|'.join('(?:{})'.format(pattern) for pattern in REGULAR_EXP)
TOKEN_MATCH = re.compile(_combined_pattern, re.IGNORECASE).match

#TOKENIZER_EXCEPTIONS = get_tokenizer_exceptions()
__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]