mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Use tokenizer URL_MATCH pattern in LIKE_URL (#8765)
This commit is contained in:
parent
4f28190afe
commit
81d3a1edb1
|
@@ -3,6 +3,7 @@ import unicodedata
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .. import attrs
|
from .. import attrs
|
||||||
|
from .tokenizer_exceptions import URL_MATCH
|
||||||
|
|
||||||
|
|
||||||
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
|
||||||
|
@@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
|
||||||
return True
|
return True
|
||||||
if tld.isalpha() and tld in _tlds:
|
if tld.isalpha() and tld in _tlds:
|
||||||
return True
|
return True
|
||||||
|
if URL_MATCH(text):
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
|
||||||
("www.google.com", True),
|
("www.google.com", True),
|
||||||
("google.com", True),
|
("google.com", True),
|
||||||
("sydney.com", True),
|
("sydney.com", True),
|
||||||
("2girls1cup.org", True),
|
("1abc2def.org", True),
|
||||||
("http://stupid", True),
|
("http://stupid", True),
|
||||||
("www.hi", True),
|
("www.hi", True),
|
||||||
|
("example.com/example", True),
|
||||||
("dog", False),
|
("dog", False),
|
||||||
("1.2", False),
|
("1.2", False),
|
||||||
("1.a", False),
|
("1.a", False),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user