mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Use tokenizer URL_MATCH pattern in LIKE_URL (#8765)
This commit is contained in:
parent
4f28190afe
commit
81d3a1edb1
|
@ -3,6 +3,7 @@ import unicodedata
|
|||
import re
|
||||
|
||||
from .. import attrs
|
||||
from .tokenizer_exceptions import URL_MATCH
|
||||
|
||||
|
||||
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
||||
|
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
|
|||
return True
|
||||
if tld.isalpha() and tld in _tlds:
|
||||
return True
|
||||
if URL_MATCH(text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
|
|||
("www.google.com", True),
|
||||
("google.com", True),
|
||||
("sydney.com", True),
|
||||
("2girls1cup.org", True),
|
||||
("1abc2def.org", True),
|
||||
("http://stupid", True),
|
||||
("www.hi", True),
|
||||
("example.com/example", True),
|
||||
("dog", False),
|
||||
("1.2", False),
|
||||
("1.a", False),
|
||||
|
|
Loading…
Reference in New Issue
Block a user