Merge pull request #879 from rappdw/rappdw/tokenizer_exceptions_url_fix

Fix for Issue #840 - URL pattern too broad

commit dd13aacc09
@@ -2,9 +2,48 @@ from __future__ import unicode_literals
 
 import re
 
-_URL_PATTERN = r'''
-^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
-'''.strip()
+# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
+# A few minor mods to this regex to account for use cases represented in test_urls
+_URL_PATTERN = (
+    r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls)
+    r"(?=[\w])"
+    # protocol identifier
+    r"(?:(?:https?|ftp|mailto)://)?"
+    # user:pass authentication
+    r"(?:\S+(?::\S*)?@)?"
+    r"(?:"
+    # IP address exclusion
+    # private & local networks
+    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
+    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
+    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
+    # IP address dotted notation octets
+    # excludes loopback network 0.0.0.0
+    # excludes reserved space >= 224.0.0.0
+    # excludes network & broadcast addresses
+    # (first & last IP address of each class)
+    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
+    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
+    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
+    r"|"
+    # host name
+    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
+    # domain name
+    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
+    # TLD identifier
+    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
+    r")"
+    # port number
+    r"(?::\d{2,5})?"
+    # resource path
+    r"(?:/\S*)?"
+    # query parameters
+    r"\??(?:\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls)
+    r"(?<=[\w/])"
+    r"$"
+).strip()
 
 TOKEN_MATCH = re.compile(_URL_PATTERN).match
 
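Two details of the new pattern are worth calling out: the `(?=[\w])` lookahead and `(?<=[\w/])` lookbehind refuse strings that still carry leading or trailing punctuation, so the tokenizer's prefix/suffix splitting gets a chance to run first, and the negative lookaheads reject private and reserved IP ranges. A quick sanity check, assuming `_URL_PATTERN` is imported from the module edited above (the exact import path depends on your checkout):

import re

# Assumed import; adjust the module path to your checkout, e.g.:
# from tokenizer_exceptions import _URL_PATTERN
url_match = re.compile(_URL_PATTERN).match

assert url_match("http://www.google.com")      # ordinary URL
assert url_match("http://142.42.1.1:8080/")    # public IP with port
assert not url_match("http://10.1.1.1")        # private network, excluded
assert not url_match('"http://google.com')     # leading quote fails (?=[\w])
assert not url_match('http://google.com"')     # trailing quote fails (?<=[\w/])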
@@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
     assert tokens[3].text == "1984"
 
 
-@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
+@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])
 def test_tokenizer_keep_urls(tokenizer, text):
     tokens = tokenizer(text)
     assert len(tokens) == 1
 
 
+@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
+def test_tokenizer_colons(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+
+
 @pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
 def test_tokenizer_keeps_email(tokenizer, text):
     tokens = tokenizer(text)
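The new `test_tokenizer_colons` case pins down the behaviour change at the tokenizer level: once `token_match` stops claiming "NASDAQ:GOOG", the colon is split by the ordinary infix rules. Roughly, in an interactive session (`spacy.blank("en")` is an assumed stand-in for the suite's `tokenizer` fixture; spaCy 1.x, current when this PR landed, builds the English tokenizer differently):

import spacy

nlp = spacy.blank("en")  # assumed stand-in for the test suite's tokenizer fixture

print([t.text for t in nlp.tokenizer("http://www.google.com")])
# expected, per test_tokenizer_keep_urls: ['http://www.google.com']

print([t.text for t in nlp.tokenizer("NASDAQ:GOOG")])
# expected, per test_tokenizer_colons: ['NASDAQ', ':', 'GOOG']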
@@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
     "http://foo.com/blah_(wikipedia)#cite-1"
 ]
 
+# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
+URLS_SHOULD_MATCH = [
+    "http://foo.com/blah_blah",
+    "http://foo.com/blah_blah/",
+#    "http://foo.com/blah_blah_(wikipedia)",
+#    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    "http://www.example.com/wpstyle/?p=364",
+    "https://www.example.com/foo/?bar=baz&inga=42&quux",
+    "http://✪df.ws/123",
+    "http://userid:password@example.com:8080",
+    "http://userid:password@example.com:8080/",
+    "http://userid@example.com",
+    "http://userid@example.com/",
+    "http://userid@example.com:8080",
+    "http://userid@example.com:8080/",
+    "http://userid:password@example.com",
+    "http://userid:password@example.com/",
+    "http://142.42.1.1/",
+    "http://142.42.1.1:8080/",
+    "http://➡.ws/䨹",
+    "http://⌘.ws",
+    "http://⌘.ws/",
+    "http://foo.com/blah_(wikipedia)#cite-1",
+    "http://foo.com/blah_(wikipedia)_blah#cite-1",
+    "http://foo.com/unicode_(✪)_in_parens",
+    "http://foo.com/(something)?after=parens",
+    "http://☺.damowmow.com/",
+    "http://code.google.com/events/#&product=browser",
+    "http://j.mp",
+    "ftp://foo.bar/baz",
+    "http://foo.bar/?q=Test%20URL-encoded%20stuff",
+    "http://مثال.إختبار",
+    "http://例子.测试",
+#    "http://उदाहरण.परीक्षा",
+    "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
+    "http://1337.net",
+    "http://a.b-c.de",
+    "http://223.255.255.254",
+]
+
+URLS_SHOULD_NOT_MATCH = [
+    "http://",
+    "http://.",
+    "http://..",
+    "http://../",
+    "http://?",
+    "http://??",
+    "http://??/",
+    "http://#",
+    "http://##",
+    "http://##/",
+    "http://foo.bar?q=Spaces should be encoded",
+    "//",
+    "//a",
+    "///a",
+    "///",
+    "http:///a",
+#    "foo.com",
+    "rdar://1234",
+    "h://test",
+    "http:// shouldfail.com",
+    ":// should fail",
+    "http://foo.bar/foo(bar)baz quux",
+    "ftps://foo.bar/",
+    "http://-error-.invalid/",
+#    "http://a.b--c.de/",  (a legit domain name; see the 9/9/2014 comment at https://gist.github.com/dperini/729294)
+    "http://-a.b.co",
+    "http://a.b-.co",
+    "http://0.0.0.0",
+    "http://10.1.1.0",
+    "http://10.1.1.255",
+    "http://224.1.1.1",
+#    "http://1.1.1.1.1",
+    "http://123.123.123",
+    "http://3628126748",
+    "http://.www.foo.bar/",
+#    "http://www.foo.bar./",
+    "http://.www.foo.bar./",
+    "http://10.1.1.1",
+    "NASDAQ:GOOG"
+]
+
 
 # Punctuation we want to check is split away before the URL
 PREFIXES = [
@@ -28,6 +110,17 @@ PREFIXES = [
 SUFFIXES = [
     '"', ":", ">"]
 
+@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
+def test_should_match(en_tokenizer, url):
+    token_match = en_tokenizer.token_match
+    if token_match:
+        assert token_match(url)
+
+@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
+def test_should_not_match(en_tokenizer, url):
+    token_match = en_tokenizer.token_match
+    if token_match:
+        assert not token_match(url)
 
 @pytest.mark.parametrize("url", URLS_BASIC)
 def test_tokenizer_handles_simple_url(tokenizer, url):
@@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
 @pytest.mark.parametrize("suffix1", SUFFIXES)
 @pytest.mark.parametrize("suffix2", SUFFIXES)
 @pytest.mark.parametrize("url", URLS_FULL)
-def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
+def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
     tokens = tokenizer(url + suffix1 + suffix2)
     assert len(tokens) == 3
     assert tokens[0].text == url
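The last hunk is more than cosmetic: both the two-prefix and two-suffix tests were previously named `test_tokenizer_handles_two_prefix_url`, and a second `def` with the same name silently rebinds the first, so pytest collected only one of the two tests. A minimal illustration with hypothetical names:

# Hypothetical module demonstrating the shadowing fixed in the hunk above.
def test_urls_with_prefix():   # this first definition is never collected ...
    assert 1 + 1 == 2

def test_urls_with_prefix():   # ... because this one rebinds the same name
    assert 2 + 2 == 4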