diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py
index 6551440f2..7d623cbb3 100644
--- a/spacy/language_data/tokenizer_exceptions.py
+++ b/spacy/language_data/tokenizer_exceptions.py
@@ -2,9 +2,48 @@ from __future__ import unicode_literals
 
 import re
 
-_URL_PATTERN = r'''
-^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
-'''.strip()
+# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
+# A few minor mods to this regex to account for use cases represented in test_urls
+_URL_PATTERN = (
+    r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls).
+    r"(?=[\w])"
+    # protocol identifier
+    r"(?:(?:https?|ftp|mailto)://)?"
+    # user:pass authentication
+    r"(?:\S+(?::\S*)?@)?"
+    r"(?:"
+    # IP address exclusion
+    # private & local networks
+    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
+    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
+    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
+    # IP address dotted notation octets
+    # excludes loopback network 0.0.0.0
+    # excludes reserved space >= 224.0.0.0
+    # excludes network & broadcast addresses
+    # (first & last IP address of each class)
+    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
+    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
+    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
+    r"|"
+    # host name
+    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
+    # domain name
+    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
+    # TLD identifier
+    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
+    r")"
+    # port number
+    r"(?::\d{2,5})?"
+    # resource path
+    r"(?:/\S*)?"
+    # query parameters
+    r"\??(?:\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls).
+    r"(?<=[\w/])"
+    r"$"
+).strip()
 
 
 TOKEN_MATCH = re.compile(_URL_PATTERN).match
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index a82284b34..822834f58 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
     assert tokens[3].text == "1984"
 
 
-@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
+@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])
 def test_tokenizer_keep_urls(tokenizer, text):
     tokens = tokenizer(text)
     assert len(tokens) == 1
 
 
+@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
+def test_tokenizer_colons(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+
+
 @pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
 def test_tokenizer_keeps_email(tokenizer, text):
     tokens = tokenizer(text)
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 0be714dfe..f4f9ef29e 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
     "http://foo.com/blah_(wikipedia)#cite-1"
 ]
 
+# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
+URLS_SHOULD_MATCH = [
+    "http://foo.com/blah_blah",
+    "http://foo.com/blah_blah/",
+#    "http://foo.com/blah_blah_(wikipedia)",
+#    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    "http://www.example.com/wpstyle/?p=364",
+    "https://www.example.com/foo/?bar=baz&inga=42&quux",
"http://✪df.ws/123", + "http://userid:password@example.com:8080", + "http://userid:password@example.com:8080/", + "http://userid@example.com", + "http://userid@example.com/", + "http://userid@example.com:8080", + "http://userid@example.com:8080/", + "http://userid:password@example.com", + "http://userid:password@example.com/", + "http://142.42.1.1/", + "http://142.42.1.1:8080/", + "http://➡.ws/䨹", + "http://⌘.ws", + "http://⌘.ws/", + "http://foo.com/blah_(wikipedia)#cite-1", + "http://foo.com/blah_(wikipedia)_blah#cite-1", + "http://foo.com/unicode_(✪)_in_parens", + "http://foo.com/(something)?after=parens", + "http://☺.damowmow.com/", + "http://code.google.com/events/#&product=browser", + "http://j.mp", + "ftp://foo.bar/baz", + "http://foo.bar/?q=Test%20URL-encoded%20stuff", + "http://مثال.إختبار", + "http://例子.测试", +# "http://उदाहरण.परीक्षा", + "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", + "http://1337.net", + "http://a.b-c.de", + "http://223.255.255.254", +] + +URLS_SHOULD_NOT_MATCH = [ + "http://", + "http://.", + "http://..", + "http://../", + "http://?", + "http://??", + "http://??/", + "http://#", + "http://##", + "http://##/", + "http://foo.bar?q=Spaces should be encoded", + "//", + "//a", + "///a", + "///", + "http:///a", +# "foo.com", + "rdar://1234", + "h://test", + "http:// shouldfail.com", + ":// should fail", + "http://foo.bar/foo(bar)baz quux", + "ftps://foo.bar/", + "http://-error-.invalid/", +# "http://a.b--c.de/", (this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 + "http://-a.b.co", + "http://a.b-.co", + "http://0.0.0.0", + "http://10.1.1.0", + "http://10.1.1.255", + "http://224.1.1.1", +# "http://1.1.1.1.1", + "http://123.123.123", + "http://3628126748", + "http://.www.foo.bar/", +# "http://www.foo.bar./", + "http://.www.foo.bar./", + "http://10.1.1.1", + "NASDAQ:GOOG" +] + # Punctuation we want to check is split away before the URL PREFIXES = [ @@ -28,6 +110,17 @@ PREFIXES = [ SUFFIXES = [ '"', ":", ">"] +@pytest.mark.parametrize("url", URLS_SHOULD_MATCH) +def test_should_match(en_tokenizer, url): + token_match = en_tokenizer.token_match + if token_match: + assert token_match(url) + +@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) +def test_should_not_match(en_tokenizer, url): + token_match = en_tokenizer.token_match + if token_match: + assert not token_match(url) @pytest.mark.parametrize("url", URLS_BASIC) def test_tokenizer_handles_simple_url(tokenizer, url): @@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): @pytest.mark.parametrize("suffix1", SUFFIXES) @pytest.mark.parametrize("suffix2", SUFFIXES) @pytest.mark.parametrize("url", URLS_FULL) -def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url): +def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url): tokens = tokenizer(url + suffix1 + suffix2) assert len(tokens) == 3 assert tokens[0].text == url