From 3452d6ce521943fb0bb02f59d3d9e3a1bac218c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 11 Oct 2017 11:24:00 +0200 Subject: [PATCH] Resolve issue #1078 by simplifying URL pattern - avoid catastrophic backtracking - reduce character range of host name, domain name and TLD identifier --- spacy/language_data/tokenizer_exceptions.py | 6 +++--- spacy/tests/tokenizer/test_urls.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index b84adb2c4..9d5187d83 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -32,11 +32,11 @@ _URL_PATTERN = ( r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name - r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" + r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)" # domain name - r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" + r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*" # TLD identifier - r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" + r"(?:\.(?:[a-z]{2,}))" r")" # port number r"(?::\d{2,5})?" diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 959067110..3bb6521f1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://⌘.ws", - "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", "http://foo.com/(something)?after=parens", - "http://☺.damowmow.com/", "http://code.google.com/events/#&product=browser", "http://j.mp", "ftp://foo.bar/baz", @@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [ "http://a.b-c.de", "http://223.255.255.254", "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - "http://✪df.ws/123", - "http://➡.ws/䨹", - "http://مثال.إختبار", - "http://例子.测试", - "http://उदाहरण.परीक्षा", pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), + pytest.mark.xfail("http://⌘.ws"), + pytest.mark.xfail("http://⌘.ws/"), + pytest.mark.xfail("http://☺.damowmow.com/"), + pytest.mark.xfail("http://✪df.ws/123"), + pytest.mark.xfail("http://➡.ws/䨹"), + pytest.mark.xfail("http://مثال.إختبار"), + pytest.mark.xfail("http://例子.测试"), + pytest.mark.xfail("http://उदाहरण.परीक्षा"), ] URLS_SHOULD_NOT_MATCH = [ @@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", - "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", @@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [ pytest.mark.xfail("foo.com"), pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://www.foo.bar./"), + pytest.mark.xfail("http://-a.b.co"), ]