mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
Merge pull request #1411 from raphael0202/issue_1078
Resolve issue #1078 by simplifying URL pattern
This commit is contained in:
commit
ffc2fef13c
|
@ -32,11 +32,11 @@ _URL_PATTERN = (
|
||||||
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
||||||
r"|"
|
r"|"
|
||||||
# host name
|
# host name
|
||||||
r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
|
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
|
||||||
# domain name
|
# domain name
|
||||||
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
|
r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"
|
||||||
# TLD identifier
|
# TLD identifier
|
||||||
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
|
r"(?:\.(?:[a-z]{2,}))"
|
||||||
r")"
|
r")"
|
||||||
# port number
|
# port number
|
||||||
r"(?::\d{2,5})?"
|
r"(?::\d{2,5})?"
|
||||||
|
|
|
@ -33,13 +33,10 @@ URLS_SHOULD_MATCH = [
|
||||||
"http://userid:password@example.com/",
|
"http://userid:password@example.com/",
|
||||||
"http://142.42.1.1/",
|
"http://142.42.1.1/",
|
||||||
"http://142.42.1.1:8080/",
|
"http://142.42.1.1:8080/",
|
||||||
"http://⌘.ws",
|
|
||||||
"http://⌘.ws/",
|
|
||||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||||
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
||||||
"http://foo.com/unicode_(✪)_in_parens",
|
"http://foo.com/unicode_(✪)_in_parens",
|
||||||
"http://foo.com/(something)?after=parens",
|
"http://foo.com/(something)?after=parens",
|
||||||
"http://☺.damowmow.com/",
|
|
||||||
"http://code.google.com/events/#&product=browser",
|
"http://code.google.com/events/#&product=browser",
|
||||||
"http://j.mp",
|
"http://j.mp",
|
||||||
"ftp://foo.bar/baz",
|
"ftp://foo.bar/baz",
|
||||||
|
@ -49,14 +46,17 @@ URLS_SHOULD_MATCH = [
|
||||||
"http://a.b-c.de",
|
"http://a.b-c.de",
|
||||||
"http://223.255.255.254",
|
"http://223.255.255.254",
|
||||||
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
||||||
"http://✪df.ws/123",
|
|
||||||
"http://➡.ws/䨹",
|
|
||||||
"http://مثال.إختبار",
|
|
||||||
"http://例子.测试",
|
|
||||||
"http://उदाहरण.परीक्षा",
|
|
||||||
|
|
||||||
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
|
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
|
||||||
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
|
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
|
||||||
|
pytest.mark.xfail("http://⌘.ws"),
|
||||||
|
pytest.mark.xfail("http://⌘.ws/"),
|
||||||
|
pytest.mark.xfail("http://☺.damowmow.com/"),
|
||||||
|
pytest.mark.xfail("http://✪df.ws/123"),
|
||||||
|
pytest.mark.xfail("http://➡.ws/䨹"),
|
||||||
|
pytest.mark.xfail("http://مثال.إختبار"),
|
||||||
|
pytest.mark.xfail("http://例子.测试"),
|
||||||
|
pytest.mark.xfail("http://उदाहरण.परीक्षा"),
|
||||||
]
|
]
|
||||||
|
|
||||||
URLS_SHOULD_NOT_MATCH = [
|
URLS_SHOULD_NOT_MATCH = [
|
||||||
|
@ -83,7 +83,6 @@ URLS_SHOULD_NOT_MATCH = [
|
||||||
"http://foo.bar/foo(bar)baz quux",
|
"http://foo.bar/foo(bar)baz quux",
|
||||||
"ftps://foo.bar/",
|
"ftps://foo.bar/",
|
||||||
"http://-error-.invalid/",
|
"http://-error-.invalid/",
|
||||||
"http://-a.b.co",
|
|
||||||
"http://a.b-.co",
|
"http://a.b-.co",
|
||||||
"http://0.0.0.0",
|
"http://0.0.0.0",
|
||||||
"http://10.1.1.0",
|
"http://10.1.1.0",
|
||||||
|
@ -99,6 +98,7 @@ URLS_SHOULD_NOT_MATCH = [
|
||||||
pytest.mark.xfail("foo.com"),
|
pytest.mark.xfail("foo.com"),
|
||||||
pytest.mark.xfail("http://1.1.1.1.1"),
|
pytest.mark.xfail("http://1.1.1.1.1"),
|
||||||
pytest.mark.xfail("http://www.foo.bar./"),
|
pytest.mark.xfail("http://www.foo.bar./"),
|
||||||
|
pytest.mark.xfail("http://-a.b.co"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user