spaCy/spacy/language_data/tokenizer_exceptions.py

12 lines
322 B
Python
Raw Normal View History

from __future__ import unicode_literals
import re
2017-01-14 17:56:41 +03:00
_URL_PATTERN = r'''
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
'''.strip()
2017-01-14 17:56:41 +03:00
TOKEN_MATCH = re.compile(_URL_PATTERN).match
__all__ = ['TOKEN_MATCH']