mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Use inline flags in token_match patterns (#5257)
* Use inline flags in token_match patterns

Use inline flags in `token_match` patterns so that serializing does not lose the flag information.

* Modify inline flag

* Modify inline flag
This commit is contained in:
parent
e8be15e9b7
commit
c981aa6684
|
@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
|
|||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
TOKEN_MATCH = re.compile(
|
||||
"|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
|
||||
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
|
||||
).match
|
||||
|
|
|
@ -58,7 +58,7 @@ URL_PATTERN = (
|
|||
# fmt: on
|
||||
).strip()
|
||||
|
||||
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
|
||||
TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
|
||||
|
||||
|
||||
BASE_EXCEPTIONS = {}
|
||||
|
|
|
@ -567,7 +567,7 @@ cdef class Tokenizer:
|
|||
))
|
||||
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
for key in ["prefix_search", "suffix_search", "infix_finditer"]:
|
||||
for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
|
||||
if key in data:
|
||||
data[key] = unescape_unicode(data[key])
|
||||
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
|
||||
|
|
Loading…
Reference in New Issue
Block a user