Resolve issue #1078 by simplifying URL pattern

- avoid catastrophic backtracking
- reduce character range of host name, domain name and TLD identifier
This commit is contained in:
Raphaël Bournhonesque 2017-10-11 11:24:00 +02:00
parent 331d338b8b
commit 3452d6ce52
2 changed files with 12 additions and 12 deletions

View File

@ -32,11 +32,11 @@ _URL_PATTERN = (
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|" r"|"
# host name # host name
r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)" r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
# domain name # domain name
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*" r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"
# TLD identifier # TLD identifier
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r"(?:\.(?:[a-z]{2,}))"
r")" r")"
# port number # port number
r"(?::\d{2,5})?" r"(?::\d{2,5})?"

View File

@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [
"http://userid:password@example.com/", "http://userid:password@example.com/",
"http://142.42.1.1/", "http://142.42.1.1/",
"http://142.42.1.1:8080/", "http://142.42.1.1:8080/",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens", "http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/(something)?after=parens", "http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://code.google.com/events/#&product=browser", "http://code.google.com/events/#&product=browser",
"http://j.mp", "http://j.mp",
"ftp://foo.bar/baz", "ftp://foo.bar/baz",
@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [
"http://a.b-c.de", "http://a.b-c.de",
"http://223.255.255.254", "http://223.255.255.254",
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"http://✪df.ws/123",
"http://➡.ws/䨹",
"http://مثال.إختبار",
"http://例子.测试",
"http://उदाहरण.परीक्षा",
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
pytest.mark.xfail("http://⌘.ws"),
pytest.mark.xfail("http://⌘.ws/"),
pytest.mark.xfail("http://☺.damowmow.com/"),
pytest.mark.xfail("http://✪df.ws/123"),
pytest.mark.xfail("http://➡.ws/䨹"),
pytest.mark.xfail("http://مثال.إختبار"),
pytest.mark.xfail("http://例子.测试"),
pytest.mark.xfail("http://उदाहरण.परीक्षा"),
] ]
URLS_SHOULD_NOT_MATCH = [ URLS_SHOULD_NOT_MATCH = [
@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [
"http://foo.bar/foo(bar)baz quux", "http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/", "ftps://foo.bar/",
"http://-error-.invalid/", "http://-error-.invalid/",
"http://-a.b.co",
"http://a.b-.co", "http://a.b-.co",
"http://0.0.0.0", "http://0.0.0.0",
"http://10.1.1.0", "http://10.1.1.0",
@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [
pytest.mark.xfail("foo.com"), pytest.mark.xfail("foo.com"),
pytest.mark.xfail("http://1.1.1.1.1"), pytest.mark.xfail("http://1.1.1.1.1"),
pytest.mark.xfail("http://www.foo.bar./"), pytest.mark.xfail("http://www.foo.bar./"),
pytest.mark.xfail("http://-a.b.co"),
] ]