mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge pull request #879 from rappdw/rappdw/tokenizer_exceptions_url_fix
Fix for Issue #840 - URL pattern too broad
This commit is contained in:
commit
dd13aacc09
|
@ -2,9 +2,48 @@ from __future__ import unicode_literals
|
|||
|
||||
import re
|
||||
|
||||
_URL_PATTERN = r'''
|
||||
^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
|
||||
'''.strip()
|
||||
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls
#
# The pattern is anchored with ^ and $, so it only accepts a string that is a
# URL from its first to its last character. Every group is non-capturing.
_URL_PATTERN = (
    r"^"
    # in order to support the prefix tokenization (see prefix test cases in test_urls).
    r"(?=[\w])"
    # protocol identifier
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters: an optional "?" followed by any run of non-space
    # characters. The group is written "(?:" (non-capturing); spelling it
    # "(:?" would match the same text but add a stray capture group.
    r"\??(?:\S*)?"
    # in order to support the suffix tokenization (see suffix test cases in test_urls),
    # the last character must be a word character or "/".
    r"(?<=[\w/])"
    r"$"
)

# Bound ``match`` method of the compiled pattern: returns a match object when
# the whole string is accepted as a URL, and None otherwise.
TOKEN_MATCH = re.compile(_URL_PATTERN).match
|
||||
|
||||
|
|
|
@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
|
|||
assert tokens[3].text == "1984"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'text',
    [
        "google.com",
        "python.org",
        "spacy.io",
        "explosion.ai",
        "http://www.google.com",
    ],
)
def test_tokenizer_keep_urls(tokenizer, text):
    """Each URL-like string must come back from the tokenizer as one token."""
    assert len(tokenizer(text)) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
def test_tokenizer_colons(tokenizer, text):
    """A colon-joined string such as "NASDAQ:GOOG" must yield exactly three tokens."""
    pieces = tokenizer(text)
    expected_count = 3
    assert len(pieces) == expected_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
|
||||
def test_tokenizer_keeps_email(tokenizer, text):
|
||||
tokens = tokenizer(text)
|
||||
|
|
|
@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
|
|||
"http://foo.com/blah_(wikipedia)#cite-1"
|
||||
]
|
||||
|
||||
# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
|
||||
URLS_SHOULD_MATCH = [
|
||||
"http://foo.com/blah_blah",
|
||||
"http://foo.com/blah_blah/",
|
||||
# "http://foo.com/blah_blah_(wikipedia)",
|
||||
# "http://foo.com/blah_blah_(wikipedia)_(again)",
|
||||
"http://www.example.com/wpstyle/?p=364",
|
||||
"https://www.example.com/foo/?bar=baz&inga=42&quux",
|
||||
"http://✪df.ws/123",
|
||||
"http://userid:password@example.com:8080",
|
||||
"http://userid:password@example.com:8080/",
|
||||
"http://userid@example.com",
|
||||
"http://userid@example.com/",
|
||||
"http://userid@example.com:8080",
|
||||
"http://userid@example.com:8080/",
|
||||
"http://userid:password@example.com",
|
||||
"http://userid:password@example.com/",
|
||||
"http://142.42.1.1/",
|
||||
"http://142.42.1.1:8080/",
|
||||
"http://➡.ws/䨹",
|
||||
"http://⌘.ws",
|
||||
"http://⌘.ws/",
|
||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||
"http://foo.com/blah_(wikipedia)_blah#cite-1",
|
||||
"http://foo.com/unicode_(✪)_in_parens",
|
||||
"http://foo.com/(something)?after=parens",
|
||||
"http://☺.damowmow.com/",
|
||||
"http://code.google.com/events/#&product=browser",
|
||||
"http://j.mp",
|
||||
"ftp://foo.bar/baz",
|
||||
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
|
||||
"http://مثال.إختبار",
|
||||
"http://例子.测试",
|
||||
# "http://उदाहरण.परीक्षा",
|
||||
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
|
||||
"http://1337.net",
|
||||
"http://a.b-c.de",
|
||||
"http://223.255.255.254",
|
||||
]
|
||||
|
||||
URLS_SHOULD_NOT_MATCH = [
|
||||
"http://",
|
||||
"http://.",
|
||||
"http://..",
|
||||
"http://../",
|
||||
"http://?",
|
||||
"http://??",
|
||||
"http://??/",
|
||||
"http://#",
|
||||
"http://##",
|
||||
"http://##/",
|
||||
"http://foo.bar?q=Spaces should be encoded",
|
||||
"//",
|
||||
"//a",
|
||||
"///a",
|
||||
"///",
|
||||
"http:///a",
|
||||
# "foo.com",
|
||||
"rdar://1234",
|
||||
"h://test",
|
||||
"http:// shouldfail.com",
|
||||
":// should fail",
|
||||
"http://foo.bar/foo(bar)baz quux",
|
||||
"ftps://foo.bar/",
|
||||
"http://-error-.invalid/",
|
||||
# "http://a.b--c.de/",  (this is a legitimate domain name; see the comment dated 9/9/2014 at https://gist.github.com/dperini/729294)
|
||||
"http://-a.b.co",
|
||||
"http://a.b-.co",
|
||||
"http://0.0.0.0",
|
||||
"http://10.1.1.0",
|
||||
"http://10.1.1.255",
|
||||
"http://224.1.1.1",
|
||||
# "http://1.1.1.1.1",
|
||||
"http://123.123.123",
|
||||
"http://3628126748",
|
||||
"http://.www.foo.bar/",
|
||||
# "http://www.foo.bar./",
|
||||
"http://.www.foo.bar./",
|
||||
"http://10.1.1.1",
|
||||
"NASDAQ:GOOG"
|
||||
]
|
||||
|
||||
|
||||
# Punctuation we want to check is split away before the URL
|
||||
PREFIXES = [
|
||||
|
@ -28,6 +110,17 @@ PREFIXES = [
|
|||
SUFFIXES = [
|
||||
'"', ":", ">"]
|
||||
|
||||
@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
def test_should_match(en_tokenizer, url):
    """Every entry of URLS_SHOULD_MATCH must be accepted by the tokenizer's token_match.

    Nothing is checked when the tokenizer's token_match is falsy.
    """
    matcher = en_tokenizer.token_match
    if not matcher:
        return
    assert matcher(url)
|
||||
|
||||
@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
def test_should_not_match(en_tokenizer, url):
    """Every entry of URLS_SHOULD_NOT_MATCH must be rejected by the tokenizer's token_match.

    Nothing is checked when the tokenizer's token_match is falsy.
    """
    matcher = en_tokenizer.token_match
    if not matcher:
        return
    assert not matcher(url)
|
||||
|
||||
@pytest.mark.parametrize("url", URLS_BASIC)
|
||||
def test_tokenizer_handles_simple_url(tokenizer, url):
|
||||
|
@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
|
|||
@pytest.mark.parametrize("suffix1", SUFFIXES)
|
||||
@pytest.mark.parametrize("suffix2", SUFFIXES)
|
||||
@pytest.mark.parametrize("url", URLS_FULL)
|
||||
def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
|
||||
def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
|
||||
tokens = tokenizer(url + suffix1 + suffix2)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == url
|
||||
|
|
Loading…
Reference in New Issue
Block a user