mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Match private networks as URLs (#11121)
This commit is contained in:
parent
5d54c0e32a
commit
551e73ccfc
|
@ -17,10 +17,6 @@ URL_PATTERN = (
|
|||
r"(?:\S+(?::\S*)?@)?"
|
||||
r"(?:"
|
||||
# IP address exclusion
|
||||
# private & local networks
|
||||
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
||||
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
||||
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
||||
# IP address dotted notation octets
|
||||
# excludes network 0.0.0.0 (the unspecified "this network" address; loopback is 127.0.0.0/8)
|
||||
# excludes reserved space >= 224.0.0.0
|
||||
|
|
|
@ -33,6 +33,9 @@ URLS_SHOULD_MATCH = [
|
|||
"http://userid:password@example.com/",
|
||||
"http://142.42.1.1/",
|
||||
"http://142.42.1.1:8080/",
|
||||
"http://10.140.12.13/foo",
|
||||
"http://10.140.12.13/foo/bar?arg1=baz&arg2=taz",
|
||||
"http://10.1.1.1",
|
||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
||||
"http://foo.com/unicode_(✪)_in_parens",
|
||||
|
@ -94,6 +97,7 @@ URLS_SHOULD_NOT_MATCH = [
|
|||
"http://foo.bar/foo(bar)baz quux",
|
||||
"http://-error-.invalid/",
|
||||
"http://a.b-.co",
|
||||
# Unspecified (0.0.0.0), network (.0) and broadcast (.255) addresses should be excluded
|
||||
"http://0.0.0.0",
|
||||
"http://10.1.1.0",
|
||||
"http://10.1.1.255",
|
||||
|
@ -102,7 +106,6 @@ URLS_SHOULD_NOT_MATCH = [
|
|||
"http://3628126748",
|
||||
"http://.www.foo.bar/",
|
||||
"http://.www.foo.bar./",
|
||||
"http://10.1.1.1",
|
||||
"NASDAQ:GOOG",
|
||||
"http://-a.b.co",
|
||||
pytest.param("foo.com", marks=pytest.mark.xfail()),
|
||||
|
|
Loading…
Reference in New Issue
Block a user