mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Revert changes to token_match priority from #4374
* Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns
* Add lookahead and potentially slow lookbehind back to the default URL pattern
* Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882
* Revert changes to Hungarian tokenizer
* Revert (xfail) several URL tests to their status before #4374
* Update `tokenizer.explain()` and docs accordingly
This commit is contained in:
parent
0345135167
commit
1139247532
|
@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿"
|
_currency = r"\$¢£€¥฿"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
_units = UNITS.replace("%", "")
|
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
@ -21,7 +20,8 @@ _prefixes = (
|
||||||
)
|
)
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
[r"\+"]
|
||||||
|
+ LIST_PUNCT
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
+ [_concat_icons]
|
+ [_concat_icons]
|
||||||
|
@ -29,7 +29,7 @@ _suffixes = (
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=_units),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||||
),
|
),
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .char_classes import ALPHA_LOWER
|
from .char_classes import ALPHA_LOWER, ALPHA
|
||||||
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
|
||||||
URL_PATTERN = (
|
URL_PATTERN = (
|
||||||
# fmt: off
|
# fmt: off
|
||||||
r"^"
|
r"^"
|
||||||
|
# Lookahead: the match must begin with a character from ALPHA or \w, in order to support the prefix tokenization (see prefix test cases in test_urls).
|
||||||
|
r"(?=[" + ALPHA + "\w])"
|
||||||
# protocol identifier (mods: make optional and expand schemes)
|
# protocol identifier (mods: make optional and expand schemes)
|
||||||
# (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
|
# (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
|
||||||
r"(?:(?:[\w\+\-\.]{2,})://)?"
|
r"(?:(?:[\w\+\-\.]{2,})://)?"
|
||||||
|
@ -54,6 +56,8 @@ URL_PATTERN = (
|
||||||
r"(?::\d{2,5})?"
|
r"(?::\d{2,5})?"
|
||||||
# resource path
|
# resource path
|
||||||
r"(?:[/?#]\S*)?"
|
r"(?:[/?#]\S*)?"
|
||||||
|
# Lookbehind: the match must end with a character from ALPHA, \w or "/", in order to support the suffix tokenization (see suffix test cases in test_urls).
|
||||||
|
r"(?<=[" + ALPHA + "\w/])"
|
||||||
r"$"
|
r"$"
|
||||||
# fmt: on
|
# fmt: on
|
||||||
).strip()
|
).strip()
|
||||||
|
|
|
@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [
|
||||||
pytest.param(
|
pytest.param(
|
||||||
"chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
|
"chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
|
||||||
),
|
),
|
||||||
"http://foo.com/blah_blah_(wikipedia)",
|
pytest.param(
|
||||||
"http://foo.com/blah_blah_(wikipedia)_(again)",
|
"http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
|
||||||
|
),
|
||||||
"http://www.foo.co.uk",
|
"http://www.foo.co.uk",
|
||||||
"http://www.foo.co.uk/",
|
"http://www.foo.co.uk/",
|
||||||
"http://www.foo.co.uk/blah/blah",
|
"http://www.foo.co.uk/blah/blah",
|
||||||
|
|
|
@ -239,6 +239,8 @@ cdef class Tokenizer:
|
||||||
cdef unicode minus_suf
|
cdef unicode minus_suf
|
||||||
cdef size_t last_size = 0
|
cdef size_t last_size = 0
|
||||||
while string and len(string) != last_size:
|
while string and len(string) != last_size:
|
||||||
|
if self.token_match and self.token_match(string):
|
||||||
|
break
|
||||||
if self._specials.get(hash_string(string)) != NULL:
|
if self._specials.get(hash_string(string)) != NULL:
|
||||||
has_special[0] = 1
|
has_special[0] = 1
|
||||||
break
|
break
|
||||||
|
@ -455,6 +457,10 @@ cdef class Tokenizer:
|
||||||
suffixes = []
|
suffixes = []
|
||||||
while substring:
|
while substring:
|
||||||
while prefix_search(substring) or suffix_search(substring):
|
while prefix_search(substring) or suffix_search(substring):
|
||||||
|
if token_match(substring):
|
||||||
|
tokens.append(("TOKEN_MATCH", substring))
|
||||||
|
substring = ''
|
||||||
|
break
|
||||||
if substring in special_cases:
|
if substring in special_cases:
|
||||||
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
||||||
substring = ''
|
substring = ''
|
||||||
|
@ -475,12 +481,12 @@ cdef class Tokenizer:
|
||||||
break
|
break
|
||||||
suffixes.append(("SUFFIX", substring[split:]))
|
suffixes.append(("SUFFIX", substring[split:]))
|
||||||
substring = substring[:split]
|
substring = substring[:split]
|
||||||
if substring in special_cases:
|
if token_match(substring):
|
||||||
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
|
||||||
substring = ''
|
|
||||||
elif token_match(substring):
|
|
||||||
tokens.append(("TOKEN_MATCH", substring))
|
tokens.append(("TOKEN_MATCH", substring))
|
||||||
substring = ''
|
substring = ''
|
||||||
|
elif substring in special_cases:
|
||||||
|
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
|
||||||
|
substring = ''
|
||||||
elif list(infix_finditer(substring)):
|
elif list(infix_finditer(substring)):
|
||||||
infixes = infix_finditer(substring)
|
infixes = infix_finditer(substring)
|
||||||
offset = 0
|
offset = 0
|
||||||
|
|
|
@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
|
||||||
suffixes = []
|
suffixes = []
|
||||||
while substring:
|
while substring:
|
||||||
while prefix_search(substring) or suffix_search(substring):
|
while prefix_search(substring) or suffix_search(substring):
|
||||||
|
if token_match(substring):
|
||||||
|
tokens.append(substring)
|
||||||
|
substring = ''
|
||||||
|
break
|
||||||
if substring in special_cases:
|
if substring in special_cases:
|
||||||
tokens.extend(special_cases[substring])
|
tokens.extend(special_cases[substring])
|
||||||
substring = ''
|
substring = ''
|
||||||
|
@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
|
||||||
split = suffix_search(substring).start()
|
split = suffix_search(substring).start()
|
||||||
suffixes.append(substring[split:])
|
suffixes.append(substring[split:])
|
||||||
substring = substring[:split]
|
substring = substring[:split]
|
||||||
if substring in special_cases:
|
if token_match(substring):
|
||||||
tokens.extend(special_cases[substring])
|
|
||||||
substring = ''
|
|
||||||
elif token_match(substring):
|
|
||||||
tokens.append(substring)
|
tokens.append(substring)
|
||||||
substring = ''
|
substring = ''
|
||||||
|
elif substring in special_cases:
|
||||||
|
tokens.extend(special_cases[substring])
|
||||||
|
substring = ''
|
||||||
elif list(infix_finditer(substring)):
|
elif list(infix_finditer(substring)):
|
||||||
infixes = infix_finditer(substring)
|
infixes = infix_finditer(substring)
|
||||||
offset = 0
|
offset = 0
|
||||||
|
@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
|
||||||
The algorithm can be summarized as follows:
|
The algorithm can be summarized as follows:
|
||||||
|
|
||||||
1. Iterate over whitespace-separated substrings.
|
1. Iterate over whitespace-separated substrings.
|
||||||
2. Check whether we have an explicitly defined rule for this substring. If we
|
2. Look for a token match. If there is a match, stop processing and keep this token.
|
||||||
|
3. Check whether we have an explicitly defined rule for this substring. If we
|
||||||
do, use it.
|
do, use it.
|
||||||
3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
|
4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
|
||||||
so that special cases always get priority.
|
so that the token match and special cases always get priority.
|
||||||
4. If we didn't consume a prefix, try to consume a suffix and then go back to
|
5. If we didn't consume a prefix, try to consume a suffix and then go back to
|
||||||
#2.
|
#2.
|
||||||
5. If we can't consume a prefix or a suffix, look for a special case.
|
6. If we can't consume a prefix or a suffix, look for a token match and, failing that, a special case.
|
||||||
6. Next, look for a token match.
|
|
||||||
7. Look for "infixes" — stuff like hyphens etc. and split the substring into
|
7. Look for "infixes" — stuff like hyphens etc. and split the substring into
|
||||||
tokens on all infixes.
|
tokens on all infixes.
|
||||||
8. Once we can't consume any more of the string, handle it as a single token.
|
8. Once we can't consume any more of the string, handle it as a single token.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user