From 03a0536cbd303abaf3a210f256222c6a11c33719 Mon Sep 17 00:00:00 2001
From: Roman Donchenko
Date: Wed, 7 Aug 2024 17:52:29 +0300
Subject: [PATCH] Speed up ProhibitSurrogateCharactersValidator

I've noticed that this validator is using a per-character loop. Replacing
it with a regex results in a pretty significant speedup. Here are results
from my benchmark:

String length    Old implementation    New implementation
                 time (sec)            time (sec)
1                2.833e-07             1.765e-07
10               5.885e-07             2.030e-07
100              3.598e-06             4.144e-07
1000             3.329e-05             2.463e-06
10000            0.0003338             2.449e-05
100000           0.003338              0.0002284
1000000          0.03333               0.002278
10000000         0.3389                0.02377
100000000        3.250                 0.2365

For large strings, the speedups are more than an order of magnitude.
---
 rest_framework/validators.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rest_framework/validators.py b/rest_framework/validators.py
index 71ebc2ca9..d0d1d7760 100644
--- a/rest_framework/validators.py
+++ b/rest_framework/validators.py
@@ -6,6 +6,8 @@ This gives us better separation of concerns, allows us to use single-step
 object creation, and makes it possible to switch between using the implicit
 `ModelSerializer` class and an equivalent explicit `Serializer` class.
 """
+import re
+
 from django.db import DataError
 from django.utils.translation import gettext_lazy as _
 
@@ -197,13 +199,14 @@ class UniqueTogetherValidator:
 
 
 class ProhibitSurrogateCharactersValidator:
+    _regex = re.compile(r'[\ud800-\udfff]')
+
     message = _('Surrogate characters are not allowed: U+{code_point:X}.')
     code = 'surrogate_characters_not_allowed'
 
     def __call__(self, value):
-        for surrogate_character in (ch for ch in str(value)
-                                    if 0xD800 <= ord(ch) <= 0xDFFF):
-            message = self.message.format(code_point=ord(surrogate_character))
+        if match := self._regex.search(str(value)):
+            message = self.message.format(code_point=ord(match.group()))
             raise ValidationError(message, code=self.code)
 
     def __eq__(self, other):