Use tokenizer URL_MATCH pattern in LIKE_URL (#8765)

This commit is contained in:
Adriane Boyd 2021-07-27 12:07:01 +02:00 committed by GitHub
parent 4f28190afe
commit 81d3a1edb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 1 deletions

View File

@ -3,6 +3,7 @@ import unicodedata
import re
from .. import attrs
from .tokenizer_exceptions import URL_MATCH
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
        return True
    if tld.isalpha() and tld in _tlds:
        return True
    if URL_MATCH(text):
        return True
    return False

View File

@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
        ("www.google.com", True),
        ("google.com", True),
        ("sydney.com", True),
        ("1abc2def.org", True),
        ("http://stupid", True),
        ("www.hi", True),
        ("example.com/example", True),
        ("dog", False),
        ("1.2", False),
        ("1.a", False),