mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
Merge branch 'french-tokenizer-exceptions'
This commit is contained in:
commit
cc9b2b74e3
|
@ -8,7 +8,6 @@ from ..language_data.tokenizer_exceptions import _URL_PATTERN
|
||||||
from ..language_data.punctuation import ALPHA_LOWER
|
from ..language_data.punctuation import ALPHA_LOWER
|
||||||
|
|
||||||
from .punctuation import ELISION, HYPHENS
|
from .punctuation import ELISION, HYPHENS
|
||||||
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
|
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import *
|
||||||
|
|
||||||
|
@ -18,13 +17,9 @@ import re
|
||||||
|
|
||||||
|
|
||||||
def get_exceptions():
|
def get_exceptions():
|
||||||
|
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
|
||||||
return BASE_EXCEPTIONS
|
return BASE_EXCEPTIONS
|
||||||
|
|
||||||
# with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'),
|
|
||||||
# 'rt', encoding='utf8') as f:
|
|
||||||
# for line in f:
|
|
||||||
# yield line.strip('\n')
|
|
||||||
|
|
||||||
|
|
||||||
def upper_first_letter(text):
|
def upper_first_letter(text):
|
||||||
if len(text) == 0:
|
if len(text) == 0:
|
||||||
|
@ -217,6 +212,4 @@ REGULAR_EXP.append(_URL_PATTERN)
|
||||||
|
|
||||||
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
|
TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
|
||||||
|
|
||||||
#TOKENIZER_EXCEPTIONS = get_tokenizer_exceptions()
|
|
||||||
|
|
||||||
__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]
|
__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user