Merge pull request #879 from rappdw/rappdw/tokenizer_exceptions_url_fix

Fix for Issue #840 - URL pattern too broad

commit dd13aacc09
@@ -2,9 +2,48 @@ from __future__ import unicode_literals
 
 import re
 
-_URL_PATTERN = r'''
-^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
-'''.strip()
+# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
+# A few minor mods to this regex to account for use cases represented in test_urls
+_URL_PATTERN = (
+    r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls)
+    r"(?=[\w])"
+    # protocol identifier
+    r"(?:(?:https?|ftp|mailto)://)?"
+    # user:pass authentication
+    r"(?:\S+(?::\S*)?@)?"
+    r"(?:"
+    # IP address exclusion
+    # private & local networks
+    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
+    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
+    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
+    # IP address dotted notation octets
+    # excludes loopback network 0.0.0.0
+    # excludes reserved space >= 224.0.0.0
+    # excludes network & broadcast addresses
+    # (first & last IP address of each class)
+    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
+    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
+    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
+    r"|"
+    # host name
+    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
+    # domain name
+    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
+    # TLD identifier
+    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
+    r")"
+    # port number
+    r"(?::\d{2,5})?"
+    # resource path
+    r"(?:/\S*)?"
+    # query parameters
+    r"\??(?:\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls)
+    r"(?<=[\w/])"
+    r"$"
+).strip()
 
 TOKEN_MATCH = re.compile(_URL_PATTERN).match
 
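Two details of the new pattern are worth calling out: the `(?=[\w])` lookahead and `(?<=[\w/])` lookbehind refuse strings that still carry leading or trailing punctuation, so the tokenizer's prefix/suffix splitting gets a chance to run first, and the negative lookaheads reject private and reserved IP ranges. A quick sanity check, assuming `_URL_PATTERN` is imported from the module edited above (the exact import path depends on your checkout):

import re

# Assumed import; adjust the module path to your checkout, e.g.:
# from tokenizer_exceptions import _URL_PATTERN
url_match = re.compile(_URL_PATTERN).match

assert url_match("http://www.google.com")      # ordinary URL
assert url_match("http://142.42.1.1:8080/")    # public IP with port
assert not url_match("http://10.1.1.1")        # private network, excluded
assert not url_match('"http://google.com')     # leading quote fails (?=[\w])
assert not url_match('http://google.com"')     # trailing quote fails (?<=[\w/])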
@@ -41,12 +41,18 @@ def test_tokenizer_handles_digits(tokenizer):
     assert tokens[3].text == "1984"
 
 
-@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai"])
+@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"])
 def test_tokenizer_keep_urls(tokenizer, text):
     tokens = tokenizer(text)
     assert len(tokens) == 1
 
 
+@pytest.mark.parametrize('text', ["NASDAQ:GOOG"])
+def test_tokenizer_colons(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+
+
 @pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"])
 def test_tokenizer_keeps_email(tokenizer, text):
     tokens = tokenizer(text)
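The new `test_tokenizer_colons` case pins down the behaviour change at the tokenizer level: once `token_match` stops claiming "NASDAQ:GOOG", the colon is split by the ordinary infix rules. Roughly, in an interactive session (`spacy.blank("en")` is an assumed stand-in for the suite's `tokenizer` fixture; spaCy 1.x, current when this PR landed, builds the English tokenizer differently):

import spacy

nlp = spacy.blank("en")  # assumed stand-in for the test suite's tokenizer fixture

print([t.text for t in nlp.tokenizer("http://www.google.com")])
# expected, per test_tokenizer_keep_urls: ['http://www.google.com']

print([t.text for t in nlp.tokenizer("NASDAQ:GOOG")])
# expected, per test_tokenizer_colons: ['NASDAQ', ':', 'GOOG']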
@@ -17,6 +17,88 @@ URLS_FULL = URLS_BASIC + [
     "http://foo.com/blah_(wikipedia)#cite-1"
 ]
 
+# URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex
+URLS_SHOULD_MATCH = [
+    "http://foo.com/blah_blah",
+    "http://foo.com/blah_blah/",
+#    "http://foo.com/blah_blah_(wikipedia)",
+#    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    "http://www.example.com/wpstyle/?p=364",
+    "https://www.example.com/foo/?bar=baz&inga=42&quux",
+    "http://✪df.ws/123",
+    "http://userid:password@example.com:8080",
+    "http://userid:password@example.com:8080/",
+    "http://userid@example.com",
+    "http://userid@example.com/",
+    "http://userid@example.com:8080",
+    "http://userid@example.com:8080/",
+    "http://userid:password@example.com",
+    "http://userid:password@example.com/",
+    "http://142.42.1.1/",
+    "http://142.42.1.1:8080/",
+    "http://➡.ws/䨹",
+    "http://⌘.ws",
+    "http://⌘.ws/",
+    "http://foo.com/blah_(wikipedia)#cite-1",
+    "http://foo.com/blah_(wikipedia)_blah#cite-1",
+    "http://foo.com/unicode_(✪)_in_parens",
+    "http://foo.com/(something)?after=parens",
+    "http://☺.damowmow.com/",
+    "http://code.google.com/events/#&product=browser",
+    "http://j.mp",
+    "ftp://foo.bar/baz",
+    "http://foo.bar/?q=Test%20URL-encoded%20stuff",
+    "http://مثال.إختبار",
+    "http://例子.测试",
+#    "http://उदाहरण.परीक्षा",
+    "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
+    "http://1337.net",
+    "http://a.b-c.de",
+    "http://223.255.255.254",
+]
+
+URLS_SHOULD_NOT_MATCH = [
+    "http://",
+    "http://.",
+    "http://..",
+    "http://../",
+    "http://?",
+    "http://??",
+    "http://??/",
+    "http://#",
+    "http://##",
+    "http://##/",
+    "http://foo.bar?q=Spaces should be encoded",
+    "//",
+    "//a",
+    "///a",
+    "///",
+    "http:///a",
+#    "foo.com",
+    "rdar://1234",
+    "h://test",
+    "http:// shouldfail.com",
+    ":// should fail",
+    "http://foo.bar/foo(bar)baz quux",
+    "ftps://foo.bar/",
+    "http://-error-.invalid/",
+#    "http://a.b--c.de/",  (a legit domain name; see the 9/9/2014 comment at https://gist.github.com/dperini/729294)
+    "http://-a.b.co",
+    "http://a.b-.co",
+    "http://0.0.0.0",
+    "http://10.1.1.0",
+    "http://10.1.1.255",
+    "http://224.1.1.1",
+#    "http://1.1.1.1.1",
+    "http://123.123.123",
+    "http://3628126748",
+    "http://.www.foo.bar/",
+#    "http://www.foo.bar./",
+    "http://.www.foo.bar./",
+    "http://10.1.1.1",
+    "NASDAQ:GOOG"
+]
+
 
 # Punctuation we want to check is split away before the URL
 PREFIXES = [
@@ -28,6 +110,17 @@ PREFIXES = [
 SUFFIXES = [
     '"', ":", ">"]
 
+@pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
+def test_should_match(en_tokenizer, url):
+    token_match = en_tokenizer.token_match
+    if token_match:
+        assert token_match(url)
+
+@pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
+def test_should_not_match(en_tokenizer, url):
+    token_match = en_tokenizer.token_match
+    if token_match:
+        assert not token_match(url)
 
 @pytest.mark.parametrize("url", URLS_BASIC)
 def test_tokenizer_handles_simple_url(tokenizer, url):
@@ -93,7 +186,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
 @pytest.mark.parametrize("suffix1", SUFFIXES)
 @pytest.mark.parametrize("suffix2", SUFFIXES)
 @pytest.mark.parametrize("url", URLS_FULL)
-def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
+def test_tokenizer_handles_two_suffix_url(tokenizer, suffix1, suffix2, url):
     tokens = tokenizer(url + suffix1 + suffix2)
     assert len(tokens) == 3
     assert tokens[0].text == url
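The last hunk is more than cosmetic: both the two-prefix and two-suffix tests were previously named `test_tokenizer_handles_two_prefix_url`, and a second `def` with the same name silently rebinds the first, so pytest collected only one of the two tests. A minimal illustration with hypothetical names:

# Hypothetical module demonstrating the shadowing fixed in the hunk above.
def test_urls_with_prefix():   # this first definition is never collected ...
    assert 1 + 1 == 2

def test_urls_with_prefix():   # ... because this one rebinds the same name
    assert 2 + 2 == 4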