mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Issue #840 - URL pattern too broad
This commit is contained in:
parent
3c1411226d
commit
3b1df3808d
|
@ -2,9 +2,48 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
_URL_PATTERN = r'''
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
||||||
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
||||||
'''.strip()
|
_URL_PATTERN = (
|
||||||
|
r"^"
|
||||||
|
# require the match to start on a word character, in order to support the prefix tokenization (see prefix test cases in test_urls).
|
||||||
|
r"(?=[\w])"
|
||||||
|
# protocol identifier
|
||||||
|
r"(?:(?:https?|ftp|mailto)://)?"
|
||||||
|
# user:pass authentication
|
||||||
|
r"(?:\S+(?::\S*)?@)?"
|
||||||
|
r"(?:"
|
||||||
|
# IP address exclusion
|
||||||
|
# private & local networks
|
||||||
|
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
||||||
|
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
||||||
|
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
||||||
|
# IP address dotted notation octets
|
||||||
|
# excludes the 0.0.0.0/8 "this network" block (first octet must be 1-223)
|
||||||
|
# excludes reserved space >= 224.0.0.0
|
||||||
|
# excludes network & broadcast addresses
|
||||||
|
# (first & last IP address of each class)
|
||||||
|
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
|
||||||
|
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
||||||
|
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
||||||
|
r"|"
|
||||||
|
# host name
|
||||||
|
r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
|
||||||
|
# domain name
|
||||||
|
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
|
||||||
|
# TLD identifier
|
||||||
|
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
|
||||||
|
r")"
|
||||||
|
# port number
|
||||||
|
r"(?::\d{2,5})?"
|
||||||
|
# resource path
|
||||||
|
r"(?:/\S*)?"
|
||||||
|
# query parameters
|
||||||
|
r"\??(?:\S*)?"
|
||||||
|
# in order to support the suffix tokenization (see suffix test cases in test_urls), the match must end on a word character or a slash.
|
||||||
|
r"(?<=[\w/])"
|
||||||
|
r"$"
|
||||||
|
).strip()
|
||||||
|
|
||||||
TOKEN_MATCH = re.compile(_URL_PATTERN).match
|
TOKEN_MATCH = re.compile(_URL_PATTERN).match
|
||||||
|
|
||||||
|
|
|
@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
|
||||||
assert tokens[3].text == "1984"
|
assert tokens[3].text == "1984"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
|
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])
|
||||||
def test_tokenizer_keep_urls(tokenizer, text):
|
def test_tokenizer_keep_urls(tokenizer, text):
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
|
||||||
|
def test_tokenizer_colons(tokenizer, text):
|
||||||
|
tokens = tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
||||||
def test_tokenizer_keeps_email(tokenizer, text):
|
def test_tokenizer_keeps_email(tokenizer, text):
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
|
|
|
@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
|
||||||
"http://foo.com/blah_(wikipedia)#cite-1"
|
"http://foo.com/blah_(wikipedia)#cite-1"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
|
||||||
|
URLS_SHOULD_MATCH = [
|
||||||
|
"http://foo.com/blah_blah",
|
||||||
|
"http://foo.com/blah_blah/",
|
||||||
|
# "http://foo.com/blah_blah_(wikipedia)",
|
||||||
|
# "http://foo.com/blah_blah_(wikipedia)_(again)",
|
||||||
|
"http://www.example.com/wpstyle/?p=364",
|
||||||
|
"https://www.example.com/foo/?bar=baz&inga=42&quux",
|
||||||
|
"http://✪df.ws/123",
|
||||||
|
"http://userid:password@example.com:8080",
|
||||||
|
"http://userid:password@example.com:8080/",
|
||||||
|
"http://userid@example.com",
|
||||||
|
"http://userid@example.com/",
|
||||||
|
"http://userid@example.com:8080",
|
||||||
|
"http://userid@example.com:8080/",
|
||||||
|
"http://userid:password@example.com",
|
||||||
|
"http://userid:password@example.com/",
|
||||||
|
"http://142.42.1.1/",
|
||||||
|
"http://142.42.1.1:8080/",
|
||||||
|
"http://➡.ws/䨹",
|
||||||
|
"http://⌘.ws",
|
||||||
|
"http://⌘.ws/",
|
||||||
|
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||||
|
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
||||||
|
"http://foo.com/unicode_(✪)_in_parens",
|
||||||
|
"http://foo.com/(something)?after=parens",
|
||||||
|
"http://☺.damowmow.com/",
|
||||||
|
"http://code.google.com/events/#&product=browser",
|
||||||
|
"http://j.mp",
|
||||||
|
"ftp://foo.bar/baz",
|
||||||
|
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
|
||||||
|
"http://مثال.إختبار",
|
||||||
|
"http://例子.测试",
|
||||||
|
# "http://उदाहरण.परीक्षा",
|
||||||
|
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
|
||||||
|
"http://1337.net",
|
||||||
|
"http://a.b-c.de",
|
||||||
|
"http://223.255.255.254",
|
||||||
|
]
|
||||||
|
|
||||||
|
URLS_SHOULD_NOT_MATCH = [
|
||||||
|
"http://",
|
||||||
|
"http://.",
|
||||||
|
"http://..",
|
||||||
|
"http://../",
|
||||||
|
"http://?",
|
||||||
|
"http://??",
|
||||||
|
"http://??/",
|
||||||
|
"http://#",
|
||||||
|
"http://##",
|
||||||
|
"http://##/",
|
||||||
|
"http://foo.bar?q=Spaces should be encoded",
|
||||||
|
"//",
|
||||||
|
"//a",
|
||||||
|
"///a",
|
||||||
|
"///",
|
||||||
|
"http:///a",
|
||||||
|
# "foo.com",
|
||||||
|
"rdar://1234",
|
||||||
|
"h://test",
|
||||||
|
"http:// shouldfail.com",
|
||||||
|
":// should fail",
|
||||||
|
"http://foo.bar/foo(bar)baz quux",
|
||||||
|
"ftps://foo.bar/",
|
||||||
|
"http://-error-.invalid/",
|
||||||
|
# "http://a.b--c.de/", (this is a legitimate domain name; see the comment dated 9/9/2014 on https://gist.github.com/dperini/729294)
|
||||||
|
"http://-a.b.co",
|
||||||
|
"http://a.b-.co",
|
||||||
|
"http://0.0.0.0",
|
||||||
|
"http://10.1.1.0",
|
||||||
|
"http://10.1.1.255",
|
||||||
|
"http://224.1.1.1",
|
||||||
|
# "http://1.1.1.1.1",
|
||||||
|
"http://123.123.123",
|
||||||
|
"http://3628126748",
|
||||||
|
"http://.www.foo.bar/",
|
||||||
|
# "http://www.foo.bar./",
|
||||||
|
"http://.www.foo.bar./",
|
||||||
|
"http://10.1.1.1",
|
||||||
|
"NASDAQ:GOOG"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# Punctuation we want to check is split away before the URL
|
# Punctuation we want to check is split away before the URL
|
||||||
PREFIXES = [
|
PREFIXES = [
|
||||||
|
@ -28,6 +110,17 @@ PREFIXES = [
|
||||||
SUFFIXES = [
|
SUFFIXES = [
|
||||||
'"', ":", ">"]
|
'"', ":", ">"]
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
|
||||||
|
def test_should_match(en_tokenizer, url):
|
||||||
|
token_match = en_tokenizer.token_match
|
||||||
|
if token_match:
|
||||||
|
assert token_match(url)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
|
||||||
|
def test_should_not_match(en_tokenizer, url):
|
||||||
|
token_match = en_tokenizer.token_match
|
||||||
|
if token_match:
|
||||||
|
assert not token_match(url)
|
||||||
|
|
||||||
@pytest.mark.parametrize("url", URLS_BASIC)
|
@pytest.mark.parametrize("url", URLS_BASIC)
|
||||||
def test_tokenizer_handles_simple_url(tokenizer, url):
|
def test_tokenizer_handles_simple_url(tokenizer, url):
|
||||||
|
@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
|
||||||
@pytest.mark.parametrize("suffix1", SUFFIXES)
|
@pytest.mark.parametrize("suffix1", SUFFIXES)
|
||||||
@pytest.mark.parametrize("suffix2", SUFFIXES)
|
@pytest.mark.parametrize("suffix2", SUFFIXES)
|
||||||
@pytest.mark.parametrize("url", URLS_FULL)
|
@pytest.mark.parametrize("url", URLS_FULL)
|
||||||
def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
|
def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
|
||||||
tokens = tokenizer(url + suffix1 + suffix2)
|
tokens = tokenizer(url + suffix1 + suffix2)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert tokens[0].text == url
|
assert tokens[0].text == url
|
||||||
|
|
Loading…
Reference in New Issue
Block a user