From c981aa66849f0e19688be746f8ecbe344e7578b7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 6 Apr 2020 13:19:04 +0200 Subject: [PATCH] Use inline flags in token_match patterns (#5257) * Use inline flags in token_match patterns Use inline flags in `token_match` patterns so that serializing does not lose the flag information. * Modify inline flag * Modify inline flag --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/tokenizer.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index dfcb2756e..cb1702300 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -461,5 +461,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE + "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) ).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 385afb8bd..29ce75442 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -58,7 +58,7 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match +TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..62b8bbf4a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -567,7 +567,7 @@ cdef class Tokenizer: )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):