Merge pull request #5121 from adrianeboyd/bugfix/revert-token-match

Revert token_match priority changes from #4374 and extend token match options

Commit f6078d866a
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .punctuation import ELISION, HYPHENS
-from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA
 
@@ -455,9 +454,6 @@ _regular_exp += [
     for hc in _hyphen_combination
 ]
 
-# URLs
-_regular_exp.append(URL_PATTERN)
-
 
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
@@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")
 
 _prefixes = (
     LIST_PUNCT
@@ -21,7 +20,8 @@ _prefixes = (
 )
 
 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from ..punctuation import ALPHA_LOWER, CURRENCY
-from ..tokenizer_exceptions import URL_PATTERN
 from ...symbols import ORTH
 
 
@@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 
 
 TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import re
 
-from .char_classes import ALPHA_LOWER
+from .char_classes import ALPHA_LOWER, ALPHA
 from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 
 
@@ -58,7 +58,8 @@ URL_PATTERN = (
     # fmt: on
 ).strip()
 
-TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
+TOKEN_MATCH = None
+URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 
 
 BASE_EXCEPTIONS = {}
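This hunk splits the old shared default in two: the base `TOKEN_MATCH` becomes `None`, while the URL regex is now exported separately as `URL_MATCH`. A minimal sketch of what downstream code sees, assuming a spaCy build that contains this change:

```python
# Minimal sketch, assuming a spaCy build that contains this change.
from spacy.lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH

print(TOKEN_MATCH)                      # None -- no global token_match default any more
print(URL_MATCH("http://example.com"))  # a match object -- URLs now go through url_match
```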
@@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer
 from .attrs import IS_STOP, LANG, NORM
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.tag_map import TAG_MAP
 from .tokens import Doc
@@ -89,6 +89,7 @@ class BaseDefaults(object):
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
         token_match = cls.token_match
+        url_match = cls.url_match
         prefix_search = (
             util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
         )
@@ -106,10 +107,12 @@ class BaseDefaults(object):
             suffix_search=suffix_search,
             infix_finditer=infix_finditer,
             token_match=token_match,
+            url_match=url_match,
         )
 
     pipe_names = ["tagger", "parser", "ner"]
     token_match = TOKEN_MATCH
+    url_match = URL_MATCH
     prefixes = tuple(TOKENIZER_PREFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
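With `url_match` added to `BaseDefaults` and passed through `create_tokenizer`, the URL hook can be adjusted per pipeline without touching `token_match`. A hedged usage sketch; the replacement regex is illustrative, not spaCy's built-in `URL_PATTERN`:

```python
# Hedged sketch: adjusting the new url_match hook on a pipeline's tokenizer.
import re
import spacy

nlp = spacy.blank("en")

# Swap in a stricter, illustrative URL pattern...
nlp.tokenizer.url_match = re.compile(r"https?://\S+").match
print([t.text for t in nlp("See (https://example.com).")])

# ...or disable URL detection entirely and rely on affix/infix rules alone.
nlp.tokenizer.url_match = None
print([t.text for t in nlp("See (https://example.com).")])
```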
@@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"]
 
 @pytest.mark.parametrize("url", URLS_SHOULD_MATCH)
 def test_should_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is not None
+    assert en_tokenizer.url_match(url) is not None
 
 
 @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH)
 def test_should_not_match(en_tokenizer, url):
-    assert en_tokenizer.token_match(url) is None
+    assert en_tokenizer.url_match(url) is None
 
 
 @pytest.mark.parametrize("url", URLS_BASIC)
@@ -17,6 +17,7 @@ cdef class Tokenizer:
     cpdef readonly Vocab vocab
 
     cdef object _token_match
+    cdef object _url_match
     cdef object _prefix_search
     cdef object _suffix_search
     cdef object _infix_finditer
@@ -30,7 +30,8 @@ cdef class Tokenizer:
     DOCS: https://spacy.io/api/tokenizer
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
-                 suffix_search=None, infix_finditer=None, token_match=None):
+                 suffix_search=None, infix_finditer=None, token_match=None,
+                 url_match=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -43,6 +44,8 @@ cdef class Tokenizer:
             `re.compile(string).finditer` to find infixes.
         token_match (callable): A boolean function matching strings to be
            recognised as tokens.
+        url_match (callable): A boolean function matching strings to be
+            recognised as tokens after considering prefixes and suffixes.
         RETURNS (Tokenizer): The newly constructed object.
 
         EXAMPLE:
@@ -55,6 +58,7 @@ cdef class Tokenizer:
         self._cache = PreshMap()
         self._specials = PreshMap()
         self.token_match = token_match
+        self.url_match = url_match
         self.prefix_search = prefix_search
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
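The new keyword is also accepted when building a `Tokenizer` directly. A hedged construction example; the affix regexes are simplified stand-ins, not the library defaults:

```python
# Hedged sketch: passing url_match when constructing a Tokenizer directly.
# The prefix/suffix/infix regexes are simplified stand-ins, not spaCy's defaults.
import re
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=re.compile(r"""^[\[\("']""").search,
    suffix_search=re.compile(r"""[\]\)"'.,!?]$""").search,
    infix_finditer=re.compile(r"[-~]").finditer,
    token_match=None,
    url_match=re.compile(r"https?://\S+").match,
)
print([t.text for t in tokenizer("Check (https://example.com).")])
```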
@@ -70,6 +74,14 @@ cdef class Tokenizer:
             self._token_match = token_match
             self._flush_cache()
 
+    property url_match:
+        def __get__(self):
+            return self._url_match
+
+        def __set__(self, url_match):
+            self._url_match = url_match
+            self._flush_cache()
+
     property prefix_search:
         def __get__(self):
             return self._prefix_search
@@ -108,11 +120,12 @@ cdef class Tokenizer:
 
     def __reduce__(self):
         args = (self.vocab,
-                self._rules,
+                self.rules,
                 self.prefix_search,
                 self.suffix_search,
                 self.infix_finditer,
-                self.token_match)
+                self.token_match,
+                self.url_match)
         return (self.__class__, args, None, None)
 
     cpdef Doc tokens_from_list(self, list strings):
@@ -240,6 +253,8 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
+            if self.token_match and self.token_match(string):
+                break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
                 break
@@ -295,7 +310,9 @@ cdef class Tokenizer:
             cache_hit = self._try_cache(hash_string(string), tokens)
             if cache_hit:
                 pass
-            elif self.token_match and self.token_match(string):
+            elif (self.token_match and self.token_match(string)) or \
+                    (self.url_match and \
+                    self.url_match(string)):
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
@@ -448,6 +465,11 @@ cdef class Tokenizer:
         suffix_search = self.suffix_search
         infix_finditer = self.infix_finditer
         token_match = self.token_match
+        if token_match is None:
+            token_match = re.compile("a^").match
+        url_match = self.url_match
+        if url_match is None:
+            url_match = re.compile("a^").match
         special_cases = {}
         for orth, special_tokens in self.rules.items():
             special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
@@ -456,6 +478,10 @@ cdef class Tokenizer:
             suffixes = []
             while substring:
                 while prefix_search(substring) or suffix_search(substring):
+                    if token_match(substring):
+                        tokens.append(("TOKEN_MATCH", substring))
+                        substring = ''
+                        break
                     if substring in special_cases:
                         tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                         substring = ''
@@ -476,12 +502,15 @@ cdef class Tokenizer:
                             break
                         suffixes.append(("SUFFIX", substring[split:]))
                         substring = substring[:split]
-                if substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
-                    substring = ''
-                elif token_match(substring):
+                if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
+                elif url_match(substring):
+                    tokens.append(("URL_MATCH", substring))
+                    substring = ''
+                elif substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
                     offset = 0
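Since `explain()` now mirrors the runtime ordering, its output is a quick way to see whether `TOKEN_MATCH`, `URL_MATCH`, a special case, or an affix rule produced a token. A hedged example; the exact labels depend on the installed language defaults:

```python
# Hedged example: inspecting which rule produced each token.
import spacy

nlp = spacy.blank("en")
for key, text in nlp.tokenizer.explain("Visit (https://example.com)!"):
    print(key, repr(text))
# Expected to include a URL_MATCH entry for "https://example.com",
# plus PREFIX/SUFFIX entries for the surrounding punctuation.
```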
@@ -543,6 +572,7 @@ cdef class Tokenizer:
             ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)),
             ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)),
             ("token_match", lambda: _get_regex_pattern(self.token_match)),
+            ("url_match", lambda: _get_regex_pattern(self.url_match)),
             ("exceptions", lambda: OrderedDict(sorted(self._rules.items())))
         ))
         exclude = util.get_serialization_exclude(serializers, exclude, kwargs)
@@ -564,11 +594,12 @@ cdef class Tokenizer:
             ("suffix_search", lambda b: data.setdefault("suffix_search", b)),
             ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)),
             ("token_match", lambda b: data.setdefault("token_match", b)),
+            ("url_match", lambda b: data.setdefault("url_match", b)),
             ("exceptions", lambda b: data.setdefault("rules", b))
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
+        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
         if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
@@ -579,6 +610,8 @@ cdef class Tokenizer:
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
         if "token_match" in data and isinstance(data["token_match"], basestring_):
             self.token_match = re.compile(data["token_match"]).match
+        if "url_match" in data and isinstance(data["url_match"], basestring_):
+            self.url_match = re.compile(data["url_match"]).match
         if "rules" in data and isinstance(data["rules"], dict):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
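Because `url_match` is now included in both the serializers and deserializers, a customized hook should survive a `to_bytes()`/`from_bytes()` round trip, at least when it is the `.match` method of a compiled regex (the serializer stores the pattern string). A hedged sketch:

```python
# Hedged sketch: url_match round-tripping through tokenizer serialization,
# assuming it is the .match method of a plain compiled regex.
import re
import spacy

nlp = spacy.blank("en")
nlp.tokenizer.url_match = re.compile(r"https?://\S+").match
data = nlp.tokenizer.to_bytes()

nlp2 = spacy.blank("en")
nlp2.tokenizer.from_bytes(data)
print(nlp2.tokenizer.url_match("https://example.com"))  # expect a match object
```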
@@ -35,14 +35,15 @@ the
 > ```
 
 | Name             | Type        | Description                                                                                                                     |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                                                          |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                                                                 |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.                                             |
 | `suffix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match suffixes.                                             |
 | `infix_finditer` | callable    | A function matching the signature of `re.compile(string).finditer` to find infixes.                                             |
-| `token_match`    | callable    | A function matching the signature of `re.compile(string).match to find token matches.                                           |
-| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                    |
+| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches.                                          |
+| `url_match`      | callable    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes.  |
+| **RETURNS**      | `Tokenizer` | The newly constructed object.                                                                                                    |
 
 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
 
@@ -738,6 +738,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
         suffixes = []
         while substring:
             while prefix_search(substring) or suffix_search(substring):
+                if token_match(substring):
+                    tokens.append(substring)
+                    substring = ''
+                    break
                 if substring in special_cases:
                     tokens.extend(special_cases[substring])
                     substring = ''
@@ -752,12 +756,15 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
                     split = suffix_search(substring).start()
                     suffixes.append(substring[split:])
                     substring = substring[:split]
-            if substring in special_cases:
-                tokens.extend(special_cases[substring])
-                substring = ''
-            elif token_match(substring):
+            if token_match(substring):
                 tokens.append(substring)
                 substring = ''
+            elif url_match(substring):
+                tokens.append(substring)
+                substring = ''
+            elif substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ''
             elif list(infix_finditer(substring)):
                 infixes = infix_finditer(substring)
                 offset = 0
@@ -778,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
 The algorithm can be summarized as follows:
 
 1. Iterate over whitespace-separated substrings.
-2. Check whether we have an explicitly defined rule for this substring. If we
-   do, use it.
-3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
-   so that special cases always get priority.
-4. If we didn't consume a prefix, try to consume a suffix and then go back to
+2. Look for a token match. If there is a match, stop processing and keep this
+   token.
+3. Check whether we have an explicitly defined special case for this substring.
+   If we do, use it.
+4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to
+   #2, so that the token match and special cases always get priority.
+5. If we didn't consume a prefix, try to consume a suffix and then go back to
    #2.
-5. If we can't consume a prefix or a suffix, look for a special case.
-6. Next, look for a token match.
-7. Look for "infixes" — stuff like hyphens etc. and split the substring into
+6. If we can't consume a prefix or a suffix, look for a URL match.
+7. If there's no URL match, then look for a special case.
+8. Look for "infixes" — stuff like hyphens etc. and split the substring into
    tokens on all infixes.
-8. Once we can't consume any more of the string, handle it as a single token.
+9. Once we can't consume any more of the string, handle it as a single token.
 
 #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"}
 
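A hedged end-to-end illustration of the revised ordering on a parenthesised URL: the prefix and suffix are consumed first (steps 4-5), and only the remainder is checked against the URL match (step 6). The output is indicative and depends on the English defaults of the installed version:

```python
# Hedged demo of the step ordering described above.
import spacy

nlp = spacy.blank("en")
print([t.text for t in nlp("(https://example.com)")])
# Indicatively: ['(', 'https://example.com', ')']
```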
@@ -832,8 +841,8 @@ domain. There are five things you would need to define:
    hyphens etc.
 5. An optional boolean function `token_match` matching strings that should never
    be split, overriding the infix rules. Useful for things like URLs or numbers.
-   Note that prefixes and suffixes will be split off before `token_match` is
-   applied.
+6. An optional boolean function `url_match`, which is similar to `token_match`
+   except prefixes and suffixes are removed before applying the match.
 
 You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
 to use `re.compile()` to build a regular expression object, and pass its
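Putting the pieces together, a hedged sketch of a custom tokenizer that supplies both hooks alongside the other components described above; every regex and special case here is a toy example, not a recommended default:

```python
# Hedged sketch: a custom tokenizer defining special cases, affix rules and both
# match hooks. All patterns are toy examples.
import re
from spacy.lang.en import English
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer

special_cases = {":)": [{ORTH: ":)"}]}
prefix_re = re.compile(r"""^[\[\("']""")
suffix_re = re.compile(r"""[\]\)"'.,!?]$""")
infix_re = re.compile(r"[-~]")
hashtag_re = re.compile(r"#\w+$")      # token_match: keep simple hashtags intact
url_re = re.compile(r"https?://\S+")   # url_match: applied after affixes are removed

nlp = English()
nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=special_cases,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=hashtag_re.match,
    url_match=url_re.match,
)
print([t.text for t in nlp("Try #spacy (https://example.com) :)")])
```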