From e7b1ee9efdbc5ecc7fbcb289e2da63e58ef34c13 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 7 Apr 2017 15:47:36 +0200
Subject: [PATCH] Switch to regex module for URL identification

The URL detection regex was failing on input such as 0.1.2.3, as this input
triggered excessive back-tracking in the builtin re module. The solution was
to switch to the regex module, which behaves better.

Closes #913.
---
 spacy/language_data/tokenizer_exceptions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py
index f01c2fdf5..1208e1219 100644
--- a/spacy/language_data/tokenizer_exceptions.py
+++ b/spacy/language_data/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-import re
+import regex

 # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
 # A few minor mods to this regex to account for use cases represented in test_urls
@@ -45,6 +45,6 @@ _URL_PATTERN = (
     r"$"
 ).strip()

-TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match
+TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match

 __all__ = ['TOKEN_MATCH']
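
Note (not part of the patch): a minimal sketch of how the new matcher can be
exercised, assuming _URL_PATTERN is importable from
spacy.language_data.tokenizer_exceptions as the diff above suggests. The
third-party regex package (installed with pip install regex) accepts the same
flag constants as the builtin re module for this usage, so only the compile
call changes.

    import regex
    from spacy.language_data.tokenizer_exceptions import _URL_PATTERN

    # Same pattern and flag as before, compiled with the regex engine instead of re.
    matcher = regex.compile(_URL_PATTERN, regex.UNICODE).match

    # The problem input from issue #913: with the builtin re engine this call
    # could back-track for a very long time; with regex it returns promptly.
    print(matcher("0.1.2.3"))

    # An ordinary URL is handled as before.
    print(matcher("http://spacy.io"))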