Merge pull request #879 from rappdw/rappdw/tokenizer_exceptions_url_fix

Fix for Issue #840 - URL pattern too broad
This commit is contained in:
Matthew Honnibal 2017-03-09 20:43:11 +01:00 committed by GitHub
commit dd13aacc09
3 changed files with 143 additions and 5 deletions

View File

@ -2,9 +2,48 @@ from __future__ import unicode_literals
import re
_URL_PATTERN = r'''
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
'''.strip()
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls
_URL_PATTERN = (
r"^"
# in order to support the prefix tokenization (see prefix test cases in test_urls).
r"(?=[\w])"
# protocol identifier
r"(?:(?:https?|ftp|mailto)://)?"
# user:pass authentication
r"(?:\S+(?::\S*)?@)?"
r"(?:"
# IP address exclusion
# private & local networks
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|"
# host name
r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
# domain name
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
# TLD identifier
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
r")"
# port number
r"(?::\d{2,5})?"
# resource path
r"(?:/\S*)?"
# query parameters
r"\??(:?\S*)?"
# in order to support the suffix tokenization (see suffix test cases in test_urls),
r"(?<=[\w/])"
r"$"
).strip()
TOKEN_MATCH = re.compile(_URL_PATTERN).match

View File

@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
assert tokens[3].text == "1984"
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])
def test_tokenizer_keep_urls(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
def test_tokenizer_colons(tokenizer, text):
tokens = tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
def test_tokenizer_keeps_email(tokenizer, text):
tokens = tokenizer(text)

View File

@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
"http://foo.com/blah_(wikipedia)#cite-1"
]
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
URLS_SHOULD_MATCH = [
"http://foo.com/blah_blah",
"http://foo.com/blah_blah/",
# "http://foo.com/blah_blah_(wikipedia)",
# "http://foo.com/blah_blah_(wikipedia)_(again)",
"http://www.example.com/wpstyle/?p=364",
"https://www.example.com/foo/?bar=baz&inga=42&quux",
"http://✪df.ws/123",
"http://userid:password@example.com:8080",
"http://userid:password@example.com:8080/",
"http://userid@example.com",
"http://userid@example.com/",
"http://userid@example.com:8080",
"http://userid@example.com:8080/",
"http://userid:password@example.com",
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://➡.ws/䨹",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://code.google.com/events/#&product=browser",
"http://j.mp",
"ftp://foo.bar/baz",
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
"http://مثال.إختبار",
"http://例子.测试",
# "http://उदाहरण.परीक्षा",
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
"http://1337.net",
"http://a.b-c.de",
"http://223.255.255.254",
]
URLS_SHOULD_NOT_MATCH = [
"http://",
"http://.",
"http://..",
"http://../",
"http://?",
"http://??",
"http://??/",
"http://#",
"http://##",
"http://##/",
"http://foo.bar?q=Spaces should be encoded",
"//",
"//a",
"///a",
"///",
"http:///a",
# "foo.com",
"rdar://1234",
"h://test",
"http:// shouldfail.com",
":// should fail",
"http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/",
"http://-error-.invalid/",
# "http://a.b--c.de/", (this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
# "http://1.1.1.1.1",
"http://123.123.123",
"http://3628126748",
"http://.www.foo.bar/",
# "http://www.foo.bar./",
"http://.www.foo.bar./",
"http://10.1.1.1",
"NASDAQ:GOOG"
]
# Punctuation we want to check is split away before the URL
PREFIXES = [
@ -28,6 +110,17 @@ PREFIXES = [
SUFFIXES = [
'"', ":", ">"]
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
token_match = en_tokenizer.token_match
if token_match:
assert token_match(url)
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
token_match = en_tokenizer.token_match
if token_match:
assert not token_match(url)
@pytest.mark.parametrize("url", URLS_BASIC)
def test_tokenizer_handles_simple_url(tokenizer, url):
@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
@pytest.mark.parametrize("suffix1", SUFFIXES)
@pytest.mark.parametrize("suffix2", SUFFIXES)
@pytest.mark.parametrize("url", URLS_FULL)
def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
tokens = tokenizer(url + suffix1 + suffix2)
assert len(tokens) == 3
assert tokens[0].text == url