Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Revert changes to token_match priority from #4374
* Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns
* Add lookahead and potentially slow lookbehind back to the default URL pattern
* Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882
* Revert changes to Hungarian tokenizer
* Revert (xfail) several URL tests to their status before #4374
* Update `tokenizer.explain()` and docs accordingly
This commit is contained in:
parent 0345135167
commit 1139247532
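To make the restored behavior concrete, here is a minimal sketch assuming the spaCy v2.x-era `Tokenizer` API; the hyphen regex, sample text, and variable names are illustrative and not part of this commit. With `token_match` given priority over the other patterns, a string it recognizes is kept as a single token once surrounding prefixes and suffixes have been stripped, instead of being split on infixes.

    import re
    import spacy
    from spacy.tokenizer import Tokenizer

    nlp = spacy.blank("en")
    default = nlp.tokenizer
    # Hypothetical token_match: keep hyphenated alphanumeric chunks as one token.
    hyphen_match = re.compile(r"^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)+$").match
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=default.prefix_search,
        suffix_search=default.suffix_search,
        infix_finditer=default.infix_finditer,
        token_match=hyphen_match,
    )

    doc = nlp("see foo-bar-baz, ok")
    print([t.text for t in doc])
    # Expected with token_match priority: ['see', 'foo-bar-baz', ',', 'ok']
    # (the trailing "," is split off as a suffix, then token_match keeps the rest whole)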
spacy/lang/hu/punctuation.py

@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
 
 _prefixes = (
     LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
 )
 
 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
spacy/lang/tokenizer_exceptions.py

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import re
 
-from .char_classes import ALPHA_LOWER
+from .char_classes import ALPHA_LOWER, ALPHA
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 
 
@@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 URL_PATTERN = (
     # fmt: off
     r"^"
+    # in order to support the prefix tokenization (see prefix test cases in test_urls).
+    r"(?=[" + ALPHA + "\w])"
     # protocol identifier (mods: make optional and expand schemes)
     # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
     r"(?:(?:[\w\+\-\.]{2,})://)?"
@@ -54,6 +56,8 @@ URL_PATTERN = (
     r"(?::\d{2,5})?"
     # resource path
     r"(?:[/?#]\S*)?"
+    # in order to support the suffix tokenization (see suffix test cases in test_urls),
+    r"(?<=[" + ALPHA + "\w/])"
     r"$"
     # fmt: on
 ).strip()
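As an aside, a rough sketch of what the restored lookahead and lookbehind buy: the pattern only matches a candidate that starts and ends on a word-like character (or "/"), leaving surrounding punctuation for the prefix and suffix rules to split off. This is not the real URL_PATTERN; the ALPHA below is an ASCII stand-in for spaCy's full Unicode character class, and the host and path parts are drastically simplified.

    import re

    ALPHA = "A-Za-z"  # ASCII stand-in for spaCy's Unicode ALPHA class

    simple_url = re.compile(
        r"^"
        r"(?=[" + ALPHA + r"\w])"      # lookahead: must start on a letter/word character
        r"(?:(?:[\w\+\-\.]{2,})://)?"  # optional scheme
        r"[\w\.\-]+"                   # host, drastically simplified
        r"(?:[/?#]\S*)?"               # optional resource path
        r"(?<=[" + ALPHA + r"\w/])"    # lookbehind: must end on a word character or "/"
        r"$"
    )

    print(bool(simple_url.match("www.example.com/page")))  # True
    print(bool(simple_url.match("(www.example.com)")))     # False: starts and ends on parentheses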
spacy/tests/tokenizer/test_urls.py

@@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [
     pytest.param(
         "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
     ),
-    "http://foo.com/blah_blah_(wikipedia)",
-    "http://foo.com/blah_blah_(wikipedia)_(again)",
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
+    ),
+    pytest.param(
+        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
+    ),
     "http://www.foo.co.uk",
     "http://www.foo.co.uk/",
     "http://www.foo.co.uk/blah/blah",
spacy/tokenizer.pyx

@@ -239,6 +239,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
                 break
@@ -455,6 +457,10 @@ cdef class Tokenizer:
             suffixes = []
             while substring:
                 while prefix_search(substring) or suffix_search(substring):
+                    if token_match(substring):
+                        tokens.append(("TOKEN_MATCH", substring))
+                        substring = ''
+                        break
                     if substring in special_cases:
                         tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                         substring = ''
@@ -475,12 +481,12 @@ cdef class Tokenizer:
                            break
                        suffixes.append(("SUFFIX", substring[split:]))
                        substring = substring[:split]
-                if substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
-                    substring = ''
-                elif token_match(substring):
+                if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
                     offset = 0
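For reference, a short usage sketch of the updated `tokenizer.explain()` debugging output. The exact tuples depend on the language defaults in use, so the expected output below is an assumption based on the default English rules, where the URL `token_match` should recognize a bare domain like "spacy.io"; the sample text is illustrative only.

    import spacy

    nlp = spacy.blank("en")
    for pattern, substring in nlp.tokenizer.explain("Visit spacy.io, please"):
        print(pattern, substring)
    # Roughly expected with the default English rules:
    #   TOKEN        Visit
    #   TOKEN_MATCH  spacy.io
    #   SUFFIX       ,
    #   TOKEN        please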
website/docs/usage/linguistic-features.md

@@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
         suffixes = []
         while substring:
             while prefix_search(substring) or suffix_search(substring):
+                if token_match(substring):
+                    tokens.append(substring)
+                    substring = ''
+                    break
                 if substring in special_cases:
                     tokens.extend(special_cases[substring])
                     substring = ''
@@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
                     split = suffix_search(substring).start()
                     suffixes.append(substring[split:])
                     substring = substring[:split]
-            if substring in special_cases:
-                tokens.extend(special_cases[substring])
-                substring = ''
-            elif token_match(substring):
+            if token_match(substring):
                 tokens.append(substring)
                 substring = ''
+            elif substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ''
             elif list(infix_finditer(substring)):
                 infixes = infix_finditer(substring)
                 offset = 0
@@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
 The algorithm can be summarized as follows:
 
 1. Iterate over whitespace-separated substrings.
-2. Check whether we have an explicitly defined rule for this substring. If we
+2. Look for a token match. If there is a match, stop processing and keep this token.
+3. Check whether we have an explicitly defined rule for this substring. If we
    do, use it.
-3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
-   so that special cases always get priority.
-4. If we didn't consume a prefix, try to consume a suffix and then go back to
+4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
+   so that the token match and special cases always get priority.
+5. If we didn't consume a prefix, try to consume a suffix and then go back to
    #2.
-5. If we can't consume a prefix or a suffix, look for a special case.
-6. Next, look for a token match.
+6. If we can't consume a prefix or a suffix, look for a special case.
 7. Look for "infixes" — stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 8. Once we can't consume any more of the string, handle it as a single token.
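Read alongside the numbered list in the docs hunk above, here is a minimal, self-contained sketch of the summarized loop, using toy regexes in place of spaCy's real prefix, suffix, and token-match rules. Special cases and infixes are omitted, so only steps 1, 2, 4, 5 and 8 are illustrated, and all names and patterns here are illustrative rather than taken from the commit.

    import re

    # Toy rules; spaCy's real rules are far richer.
    prefix_search = re.compile(r"^[\(\[\"']").search
    suffix_search = re.compile(r"[\)\]\"',\.!?]$").search
    token_match = re.compile(r"^https?://[\w\./\-]+$").match

    def toy_tokenize(text):
        tokens = []
        for substring in text.split():          # step 1: whitespace-separated substrings
            suffixes = []
            while substring:
                if token_match(substring):       # step 2: a token match wins outright
                    tokens.append(substring)
                    substring = ""
                    continue
                m = prefix_search(substring)     # step 4: consume one prefix, then retry
                if m:
                    tokens.append(substring[:m.end()])
                    substring = substring[m.end():]
                    continue
                m = suffix_search(substring)     # step 5: consume one suffix, then retry
                if m:
                    suffixes.insert(0, substring[m.start():])
                    substring = substring[:m.start()]
                    continue
                tokens.append(substring)         # step 8: nothing left to strip
                substring = ""
            tokens.extend(suffixes)
        return tokens

    print(toy_tokenize("see (https://example.com/x), thanks!"))
    # Roughly: ['see', '(', 'https://example.com/x', ')', ',', 'thanks', '!']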