From 03a0536cbd303abaf3a210f256222c6a11c33719 Mon Sep 17 00:00:00 2001
From: Roman Donchenko <roman@cvat.ai>
Date: Wed, 7 Aug 2024 17:52:29 +0300
Subject: [PATCH] Speed up ProhibitSurrogateCharactersValidator

This validator currently scans its input with a per-character Python loop.
Replacing the loop with a precompiled regex search results in a significant
speedup. Here are the results from my benchmark:

    String length      Old implementation    New implementation
                           time (sec)            time (sec)

              1          2.833e-07             1.765e-07
             10          5.885e-07             2.030e-07
            100          3.598e-06             4.144e-07
           1000          3.329e-05             2.463e-06
          10000          0.0003338             2.449e-05
         100000          0.003338              0.0002284
        1000000          0.03333               0.002278
       10000000          0.3389                0.02377
      100000000          3.250                 0.2365

For strings of 1000 characters or more, the speedup is more than an order of
magnitude (roughly 13-15x in the measurements above).
---
 rest_framework/validators.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rest_framework/validators.py b/rest_framework/validators.py
index 71ebc2ca9..d0d1d7760 100644
--- a/rest_framework/validators.py
+++ b/rest_framework/validators.py
@@ -6,6 +6,8 @@ This gives us better separation of concerns, allows us to use single-step
 object creation, and makes it possible to switch between using the implicit
 `ModelSerializer` class and an equivalent explicit `Serializer` class.
 """
+import re
+
 from django.db import DataError
 from django.utils.translation import gettext_lazy as _
 
@@ -197,13 +199,14 @@ class UniqueTogetherValidator:
 
 
 class ProhibitSurrogateCharactersValidator:
+    _regex = re.compile(r'[\ud800-\udfff]')
+
     message = _('Surrogate characters are not allowed: U+{code_point:X}.')
     code = 'surrogate_characters_not_allowed'
 
     def __call__(self, value):
-        for surrogate_character in (ch for ch in str(value)
-                                    if 0xD800 <= ord(ch) <= 0xDFFF):
-            message = self.message.format(code_point=ord(surrogate_character))
+        if match := self._regex.search(str(value)):
+            message = self.message.format(code_point=ord(match.group()))
             raise ValidationError(message, code=self.code)
 
     def __eq__(self, other):