mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
Merge branch 'french-tokenizer-exceptions'
This commit is contained in:
commit
cc9b2b74e3
|
@ -8,7 +8,6 @@ from ..language_data.tokenizer_exceptions import _URL_PATTERN
|
|||
from ..language_data.punctuation import ALPHA_LOWER
|
||||
|
||||
from .punctuation import ELISION, HYPHENS
|
||||
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
|
||||
|
||||
from ..symbols import *
|
||||
|
||||
|
@ -18,13 +17,9 @@ import re
|
|||
|
||||
|
||||
def get_exceptions():
|
||||
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
|
||||
return BASE_EXCEPTIONS
|
||||
|
||||
# with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'),
|
||||
# 'rt', encoding='utf8') as f:
|
||||
# for line in f:
|
||||
# yield line.strip('\n')
|
||||
|
||||
|
||||
def upper_first_letter(text):
|
||||
if len(text) == 0:
|
||||
|
@ -217,6 +212,4 @@ REGULAR_EXP.append(_URL_PATTERN)
|
|||
|
||||
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
|
||||
|
||||
#TOKENIZER_EXCEPTIONS = get_tokenizer_exceptions()
|
||||
|
||||
__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]
|
||||
|
|
Loading…
Reference in New Issue
Block a user