diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index dfcb2756e..cb1702300 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN)
 
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
+    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 385afb8bd..29ce75442 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -58,7 +58,7 @@ URL_PATTERN = (
     # fmt: on
 ).strip()
 
-TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
+TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
 
 
 BASE_EXCEPTIONS = {}
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 4da081259..62b8bbf4a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -567,7 +567,7 @@ cdef class Tokenizer:
         ))
         exclude = util.get_serialization_exclude(deserializers, exclude, kwargs)
         msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for key in ["prefix_search", "suffix_search", "infix_finditer"]:
+        for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
         if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
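Why inline flags: when the Tokenizer is serialized, each regex is stored as its .pattern string, so flags passed as the second argument to re.compile() are dropped on a save/load round trip. Moving IGNORECASE and UNICODE into the pattern itself via "(?iu)" keeps them attached to the string. A minimal sketch of the difference, using a simplified placeholder in place of spaCy's much longer URL_PATTERN:

    import re

    # Simplified stand-in for spaCy's URL_PATTERN (an assumption for
    # illustration, not the real pattern).
    URL_PATTERN = r"https?://\S+"

    # Flags passed as re.compile()'s second argument live only on the
    # compiled object; recompiling just .pattern loses them.
    lost = re.compile(URL_PATTERN, re.IGNORECASE | re.UNICODE)
    assert not (re.compile(lost.pattern).flags & re.IGNORECASE)

    # Inline flags are part of the pattern string itself, so compiling
    # the serialized .pattern restores the same behaviour.
    kept = re.compile("(?iu)" + URL_PATTERN)
    assert re.compile(kept.pattern).flags & re.IGNORECASE

The tokenizer.pyx hunk complements this: on deserialization, token_match is now passed through unescape_unicode alongside prefix_search, suffix_search, and infix_finditer, so the pattern string survives the round trip intact.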