Revert changes to token_match priority from #4374

* Revert changes to priority of `token_match` so that it has priority
over all other tokenizer patterns

* Add lookahead and potentially slow lookbehind back to the default URL
pattern

* Expand character classes in URL pattern to improve matching around
lookaheads and lookbehinds related to #4882

* Revert changes to Hungarian tokenizer

* Revert (xfail) several URL tests to their status before #4374

* Update `tokenizer.explain()` and docs accordingly
Author: Adriane Boyd
Date: 2020-03-09 12:09:41 +01:00
Parent: 0345135167
Commit: 1139247532
5 changed files with 38 additions and 20 deletions
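For context, a minimal sketch of the intended end-to-end behavior after this revert (the pipeline, sentence, and URL below are illustrative, not taken from the changeset): with the URL pattern back in `token_match` at top priority, a URL wrapped in punctuation should come out as a single token while the surrounding punctuation is still split off by the affix rules.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("See (http://www.example.com/blah) today.")
# The parentheses and the trailing period are removed by the prefix/suffix
# rules; the URL itself is expected to survive as one token via token_match.
print([t.text for t in doc])
# Roughly: ['See', '(', 'http://www.example.com/blah', ')', 'today', '.']
```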

spacy/lang/hu/punctuation.py

@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
 _prefixes = (
     LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
 )
 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
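As an illustration of the restored Hungarian suffix rules (the example word is made up, and this is a sketch rather than a test from the commit), the re-added bare `\+` suffix means a trailing plus sign should again be split off even when it does not follow a digit:

```python
import spacy

nlp = spacy.blank("hu")
# With [r"\+"] back in _suffixes, a trailing "+" is treated as a suffix even
# after a letter; previously only the digit-conditioned rule (?<=[0-9])\+ applied.
print([t.text for t in nlp("valami+")])
# Expected under these rules: ['valami', '+']
```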

spacy/lang/tokenizer_exceptions.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
-from .char_classes import ALPHA_LOWER
+from .char_classes import ALPHA_LOWER, ALPHA
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
@@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 URL_PATTERN = (
     # fmt: off
     r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls).
+    r"(?=[" + ALPHA + "\w])"
     # protocol identifier (mods: make optional and expand schemes)
     # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
     r"(?:(?:[\w\+\-\.]{2,})://)?"
@@ -54,6 +56,8 @@ URL_PATTERN = (
     r"(?::\d{2,5})?"
     # resource path
     r"(?:[/?#]\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls),
+    r"(?<=[" + ALPHA + "\w/])"
     r"$"
     # fmt: on
 ).strip()
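The effect of the added lookaround assertions is easier to see on a toy pattern. The sketch below is not the full URL_PATTERN, just a simplified stand-in with the same shape (a `^(?=...)` lookahead and a `(?<=...)$` lookbehind, restricted here to ASCII word characters instead of the ALPHA class): a candidate that still carries surrounding punctuation fails to match, so the tokenizer keeps stripping prefixes and suffixes until a clean URL remains.

```python
import re

# Simplified stand-in for URL_PATTERN: must start with a word character and
# end with a word character or "/", with an optional scheme in between.
toy_url = re.compile(r"^(?=\w)(?:\w+://)?\S+(?<=[\w/])$")

for candidate in ['"http://example.com"', 'http://example.com"', "http://example.com"]:
    print(candidate, "->", bool(toy_url.match(candidate)))
# Only the last candidate, with no leftover quotes, matches.
```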

spacy/tests/tokenizer/test_urls.py

@@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [
     pytest.param(
         "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
     ),
-    "http://foo.com/blah_blah_(wikipedia)",
-    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
+    ),
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
+    ),
     "http://www.foo.co.uk",
     "http://www.foo.co.uk/",
     "http://www.foo.co.uk/blah/blah",

spacy/tokenizer.pyx

@@ -239,6 +239,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
                 break
@@ -455,6 +457,10 @@ cdef class Tokenizer:
             suffixes = []
             while substring:
                 while prefix_search(substring) or suffix_search(substring):
+                    if token_match(substring):
+                        tokens.append(("TOKEN_MATCH", substring))
+                        substring = ''
+                        break
                     if substring in special_cases:
                         tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                         substring = ''
@@ -475,12 +481,12 @@ cdef class Tokenizer:
                             break
                         suffixes.append(("SUFFIX", substring[split:]))
                         substring = substring[:split]
-                if substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
-                    substring = ''
-                elif token_match(substring):
+                if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
                     offset = 0
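A hedged usage sketch of what the reordered checks mean in practice (the pattern and text are hypothetical, and the tokenizer below is built with only the default English prefix and suffix rules for brevity): because `token_match` is now consulted before any prefixes or suffixes are stripped, a custom pattern can protect strings that the affix rules would otherwise cut apart.

```python
import re

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    # Hypothetical pattern: keep "(=...=)" strings whole even though "(" and
    # ")" are also prefix/suffix characters.
    token_match=re.compile(r"^\(=[^\s=]+=\)$").match,
)

print([t.text for t in tokenizer("(=happy=) face!")])
# Expected: '(=happy=)' stays a single token because token_match is checked
# before affix splitting, while '!' is still split off 'face' as a suffix.
```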

website/docs/usage/linguistic-features.md

@@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
         suffixes = []
         while substring:
             while prefix_search(substring) or suffix_search(substring):
+                if token_match(substring):
+                    tokens.append(substring)
+                    substring = ''
+                    break
                 if substring in special_cases:
                     tokens.extend(special_cases[substring])
                     substring = ''
@@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
                     split = suffix_search(substring).start()
                     suffixes.append(substring[split:])
                     substring = substring[:split]
-            if substring in special_cases:
-                tokens.extend(special_cases[substring])
-                substring = ''
-            elif token_match(substring):
+            if token_match(substring):
                 tokens.append(substring)
                 substring = ''
+            elif substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ''
             elif list(infix_finditer(substring)):
                 infixes = infix_finditer(substring)
                 offset = 0
@@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
 The algorithm can be summarized as follows:
 1. Iterate over whitespace-separated substrings.
-2. Check whether we have an explicitly defined rule for this substring. If we
+2. Look for a token match. If there is a match, stop processing and keep this token.
+3. Check whether we have an explicitly defined rule for this substring. If we
    do, use it.
-3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
-   so that special cases always get priority.
-4. If we didn't consume a prefix, try to consume a suffix and then go back to
+4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
+   so that the token match and special cases always get priority.
+5. If we didn't consume a prefix, try to consume a suffix and then go back to
    #2.
-5. If we can't consume a prefix or a suffix, look for a special case.
-6. Next, look for a token match.
+6. If we can't consume a prefix or a suffix, look for a special case.
 7. Look for "infixes" — stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 8. Once we can't consume any more of the string, handle it as a single token.
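To see the summarized order on a concrete string, `tokenizer.explain()` (whose output mirrors the pseudocode above) reports the rule that produced each token; the input string here is illustrative:

```python
import spacy

nlp = spacy.blank("en")
for rule, token_text in nlp.tokenizer.explain('("http://example.com/about")'):
    print(rule, token_text)
# Expected shape of the output: '(' and '"' reported as PREFIX, the URL kept
# whole by the token match, and the closing '"' and ')' reported as SUFFIX.
```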