mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-29 17:33:10 +03:00
Switch to regex module for URL identification
The URL detection regex was failing on input such as `0.1.2.3`, because this input triggered excessive backtracking in the built-in `re` module. The solution was to switch to the `regex` module, which behaves better on this input. Closes #913.
This commit is contained in:
parent
5887383fc0
commit
e7b1ee9efd
|
@ -1,6 +1,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import regex
|
||||||
|
|
||||||
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
||||||
# A few minor mods to this regex to account for use cases represented in test_urls
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
||||||
|
@ -45,6 +45,6 @@ _URL_PATTERN = (
|
||||||
r"$"
|
r"$"
|
||||||
).strip()
|
).strip()
|
||||||
|
|
||||||
TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match
|
TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match
|
||||||
|
|
||||||
__all__ = ['TOKEN_MATCH']
|
__all__ = ['TOKEN_MATCH']
|
||||||
|
|
Loading…
Reference in New Issue
Block a user