Improve URL_PATTERN and handling in tokenizer (#4374)

* Move prefix and suffix detection for URL_PATTERN

Move prefix and suffix detection for `URL_PATTERN` into the tokenizer.
Remove associated lookahead and lookbehind from `URL_PATTERN`.

Fix Hungarian tokenization to account for the modified prefix and suffix
handling.

* Match a wider range of URI schemes
adrianeboyd 2019-10-05 13:00:09 +02:00 committed by Matthew Honnibal
parent e65dffd80b
commit cbc2cee2c8
4 changed files with 15 additions and 17 deletions
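
For illustration, a minimal sketch of the intended behavior (hypothetical usage, assuming a spaCy build that includes this commit; the blank English pipeline is just a convenient way to get a default tokenizer): surrounding punctuation is split off before the URL match is applied to the remainder.

    from spacy.lang.en import English

    nlp = English()
    # The trailing ")" is found by the suffix rules, so token_match no longer
    # short-circuits; the suffix is stripped and the bare URL matches in full.
    print([t.text for t in nlp("(see https://example.com)")])
    # expected: ['(', 'see', 'https://example.com', ')']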

spacy/lang/hu/punctuation.py

@@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
-    [r"\+"]
-    + LIST_PUNCT
+    LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
     r"(?<=[0-9])\+",
     r"(?<=°[FfCcKk])\.",
     r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-    r"(?<=[0-9])(?:{u})".format(u=UNITS),
+    r"(?<=[0-9])(?:{u})".format(u=_units),
     r"(?<=[{al}{e}{q}(?:{c})])\.".format(
         al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
     ),
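
The `_units` change can be checked on its own. A small sketch (assuming `UNITS` from `spacy.lang.char_classes`, and approximating how spaCy anchors suffix rules with a trailing `$`): with `%` removed from the alternation, a percent sign after a digit is no longer treated as a unit suffix, while real units still are.

    import re

    from spacy.lang.char_classes import UNITS

    _units = UNITS.replace("%", "")
    suffix_re = re.compile(r"(?<=[0-9])(?:{u})$".format(u=_units))
    print(suffix_re.search("10km"))  # matches 'km', which is still split off
    print(suffix_re.search("10%"))   # None: '%' stays attached to the number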

spacy/lang/tokenizer_exceptions.py

@@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 # A few minor mods to this regex to account for use cases represented in test_urls
 URL_PATTERN = (
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[\w])"
-    # protocol identifier
-    r"(?:(?:https?|ftp|mailto)://)?"
-    # user:pass authentication
+    # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
+    r"(?:(?:[\w\+\-\.]{2,})://)?"
+    # mailto:user or user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
@@ -43,11 +41,7 @@
     # port number
     r"(?::\d{2,5})?"
     # resource path
-    r"(?:/\S*)?"
-    # query parameters
-    r"\??(:?\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[\w/])"
+    r"(?:[/?#]\S*)?"
     r"$"
 ).strip()
 
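The widened protocol identifier is easy to exercise standalone. A sketch isolating just the scheme fragment of `URL_PATTERN` (with its optional `?` dropped so that a scheme is required to match):

    import re

    scheme_re = re.compile(r"^(?:[\w\+\-\.]{2,})://")
    for url in ["https://x.com", "svn+ssh://host/path",
                "chrome-extension://abc", "x://host"]:
        print(url, bool(scheme_re.match(url)))
    # the first three match; 'x://host' fails the {2,} two-character minimum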

spacy/tests/tokenizer/test_urls.py

@@ -12,6 +12,7 @@ URLS_BASIC = [
 
 URLS_FULL = URLS_BASIC + [
     "mailto:foo-bar@baz-co.com",
+    "mailto:foo-bar@baz-co.com?subject=hi",
     "www.google.com?q=google",
     "http://foo.com/blah_(wikipedia)#cite-1",
 ]
@@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
     "http://a.b-c.de",
     "http://223.255.255.254",
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
+    "ssh://login@server.com:12345/repository.git",
+    "svn+ssh://user@ssh.yourdomain.com/path",
+    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
     "http:// shouldfail.com",
     ":// should fail",
     "http://foo.bar/foo(bar)baz quux",
-    "ftps://foo.bar/",
     "http://-error-.invalid/",
     "http://a.b-.co",
     "http://0.0.0.0",

spacy/tokenizer.pyx

@@ -227,7 +227,9 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string):
+            if self.token_match and self.token_match(string) \
+                    and not self.find_prefix(string) \
+                    and not self.find_suffix(string):
                 break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
@@ -243,8 +245,6 @@
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     has_special[0] = 1
                     break
-            if self.token_match and self.token_match(string):
-                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]
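
The effect of the reordering is easier to see in plain Python. A rough paraphrase of the affix loop (a simplified sketch, not the real Cython, which also handles special cases, caching, and the vocab): `token_match` now wins only when no prefix or suffix rule also applies, so affixes are peeled off first and the shrinking remainder is re-checked on every iteration.

    def split_affixes(string, token_match, find_prefix, find_suffix):
        # find_prefix/find_suffix return the length of the match, or 0.
        prefixes, suffixes = [], []
        last_size = -1
        while string and len(string) != last_size:
            last_size = len(string)
            # Accept the whole string as one token only if no affix rule
            # applies to it as well.
            if (token_match(string) and not find_prefix(string)
                    and not find_suffix(string)):
                break
            pre_len = find_prefix(string)
            if pre_len:
                prefixes.append(string[:pre_len])
                string = string[pre_len:]
            suf_len = find_suffix(string) if string else 0
            if suf_len:
                suffixes.insert(0, string[-suf_len:])
                string = string[:-suf_len]
        return prefixes, string, suffixes

With affix helpers built from a language's punctuation rules, "(https://example.com)" comes out as (['('], 'https://example.com', [')']): the parentheses are stripped over two iterations before the bare URL finally satisfies the token_match check.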