Merge branch 'bytea-parser' into devel

2025-11-17 08:05:28 +03:00 · 2011-03-26 13:02:57 +00:00 · 2011-03-26 13:02:57 +00:00 · 90536a187d
commit 90536a187d
parent f34e44b3f4 da58bee70a
6 changed files with 265 additions and 110 deletions
--- a/2
+++ b/2
@ -1,6 +1,8 @@
 What's new in psycopg 2.4.1
 ---------------------------

+  - Use own parser for bytea output, no longer requiring libpq 9.0
+    to parse the hex format.
  - Correctly detect an empty query sent to the backend (ticket #46).


--- a/doc/src/faq.rst
+++ b/doc/src/faq.rst
@ -97,7 +97,9 @@ Psycopg converts :sql:`decimal`\/\ :sql:`numeric` database types into Python `!D
 Transferring binary data from PostgreSQL 9.0 doesn't work.
    PostgreSQL 9.0 uses by default `the "hex" format`__ to transfer
    :sql:`bytea` data: the format can't be parsed by the libpq 8.4 and
-    earlier. Three options to solve the problem are:
+    earlier. The problem is solved in Psycopg 2.4.1, which uses its own parser
+    for the :sql:`bytea` format. For previous Psycopg releases, three options
+    to solve the problem are:

    - set the bytea_output__ parameter to ``escape`` in the server;
    - execute the database command ``SET bytea_output TO escape;`` in the
--- a/doc/src/usage.rst
+++ b/doc/src/usage.rst
@ -271,6 +271,10 @@ the SQL string that would be sent to the database.
  .. versionchanged:: 2.4
     only strings were supported before.

+  .. versionchanged:: 2.4.1
+     can parse the 'hex' format from 9.0 servers without relying on the
+     version of the client library.
+
  .. note::

    In Python 2, if you have binary data in a `!str` object, you can pass them
@ -282,17 +286,14 @@ the SQL string that would be sent to the database.

  .. warning::

-     PostgreSQL 9 uses by default `a new "hex" format`__ to emit :sql:`bytea`
-     fields. Unfortunately this format can't be parsed by libpq versions
-     before 9.0. This means that using a library client with version lesser
-     than 9.0 to talk with a server 9.0 or later you may have problems
-     receiving :sql:`bytea` data. To work around this problem you can set the
-     `bytea_output`__ parameter to ``escape``, either in the server
-     configuration or in the client session using a query such as ``SET
-     bytea_output TO escape;`` before trying to receive binary data.
-     
-     Starting from Psycopg 2.4 this condition is detected and signaled with a
-     `~psycopg2.InterfaceError`.
+     Since version 9.0 PostgreSQL uses by default `a new "hex" format`__ to
+     emit :sql:`bytea` fields. Starting from Psycopg 2.4.1 the format is
+     correctly supported.  If you use a previous version you will need some
+     extra care when receiving bytea from PostgreSQL: you must have at least
+     the libpq 9.0 installed on the client or alternatively you can set the
+     `bytea_output`__ configuration parameter to ``escape``, either in the
+     server configuration file or in the client session (using a query such as
+     ``SET bytea_output TO escape;``) before receiving binary data.
     
     .. __: http://www.postgresql.org/docs/9.0/static/datatype-binary.html
     .. __: http://www.postgresql.org/docs/9.0/static/runtime-config-client.html#GUC-BYTEA-OUTPUT
--- a/psycopg/typecast_binary.c
+++ b/psycopg/typecast_binary.c
@ -40,7 +40,7 @@ chunk_dealloc(chunkObject *self)
        FORMAT_CODE_PY_SSIZE_T,
        self->base, self->len
      );
-    PQfreemem(self->base);
+    PyMem_Free(self->base);
    Py_TYPE(self)->tp_free((PyObject *)self);
 }

@ -127,95 +127,185 @@ PyTypeObject chunkType = {
    chunk_doc                   /* tp_doc */
 };

-static PyObject *
+
+static char *psycopg_parse_hex(
+        const char *bufin, Py_ssize_t sizein, Py_ssize_t *sizeout);
+static char *psycopg_parse_escape(
+        const char *bufin, Py_ssize_t sizein, Py_ssize_t *sizeout);
+
+/* Convert a bytea value in PostgreSQL output format into a Python buffer.
+ *
+ * 's' points to 'l' bytes of input; a NULL 's' yields None.
+ * Input starting with "\x" is decoded as 'hex' format, anything else as the
+ * classic 'escape' format. Return a new reference to a buffer (Python 2) or
+ * a memoryview (Python 3) built on a chunk object that owns the decoded
+ * bytes. On error return NULL with an exception set.
+ *
+ * The function is not static and not hidden as we use ctypes to test it.
+ */
+PyObject *
 typecast_BINARY_cast(const char *s, Py_ssize_t l, PyObject *curs)
 {
    chunkObject *chunk = NULL;
    PyObject *res = NULL;
-    char *str = NULL, *buffer = NULL;
-    size_t len;
+    char *buffer = NULL;
+    Py_ssize_t len;

    if (s == NULL) {Py_INCREF(Py_None); return Py_None;}

-    /* PQunescapeBytea absolutely wants a 0-terminated string and we don't
-       want to copy the whole buffer, right? Wrong, but there isn't any other
-       way <g> */
-    if (s[l] != '\0') {
-        if ((buffer = PyMem_Malloc(l+1)) == NULL) {
-            PyErr_NoMemory();
-            goto fail;
+    /* Check the length before looking at the prefix: with less than two
+     * bytes of input s[1] is not part of the data. */
+    if (l >= 2 && s[0] == '\\' && s[1] == 'x') {
+        /* This is a buffer escaped in hex format: libpq before 9.0 can't
+         * parse it and we can't detect reliably the libpq version at runtime.
+         * So the only robust option is to parse it ourselves - luckily it's
+         * an easy format.
+         */
+        if (NULL == (buffer = psycopg_parse_hex(s, l, &len))) {
+            goto exit;
        }
-        /* Py_ssize_t->size_t cast is safe, as long as the Py_ssize_t is
-         * >= 0: */
-        assert (l >= 0);
-        strncpy(buffer, s, (size_t) l);
-
-        buffer[l] = '\0';
-        s = buffer;
    }
-    str = (char*)PQunescapeBytea((unsigned char*)s, &len);
-    Dprintf("typecast_BINARY_cast: unescaped " FORMAT_CODE_SIZE_T " bytes",
-      len);
-
-    /* The type of the second parameter to PQunescapeBytea is size_t *, so it's
-     * possible (especially with Python < 2.5) to get a return value too large
-     * to fit into a Python container. */
-    if (len > (size_t) PY_SSIZE_T_MAX) {
-      PyErr_SetString(PyExc_IndexError, "PG buffer too large to fit in Python"
-                                        " buffer.");
-      goto fail;
+    else {
+        /* This is a buffer in the classic bytea format. So we can hand it
+         * to PQunescapeBytea to have it parsed, right? ...Wrong. We
+         * could, but then we'd have to record whether buffer was allocated by
+         * Python or by the libpq to dispose it properly. Furthermore the
+         * PQunescapeBytea interface is not the most brilliant as it wants a
+         * null-terminated string even if we have known its length thus
+         * requiring a useless memcpy and strlen.
+         * So we'll just have our better integrated parser, let's finish this
+         * story.
+         */
+        if (NULL == (buffer = psycopg_parse_escape(s, l, &len))) {
+            goto exit;
        }
-
-    /* Check the escaping was successful */
-    if (s[0] == '\\' && s[1] == 'x'     /* input encoded in hex format */
-        && str[0] == 'x'                /* output resulted in an 'x' */
-        && s[2] != '7' && s[3] != '8')  /* input wasn't really an x (0x78) */
-    {
-        PyErr_SetString(InterfaceError,
-            "can't receive bytea data from server >= 9.0 with the current "
-            "libpq client library: please update the libpq to at least 9.0 "
-            "or set bytea_output to 'escape' in the server config "
-            "or with a query");
-        goto fail;
    }

    chunk = (chunkObject *) PyObject_New(chunkObject, &chunkType);
-    if (chunk == NULL) goto fail;
+    if (chunk == NULL) goto exit;

-    /* **Transfer** ownership of str's memory to the chunkObject: */
-    chunk->base = str;
-    str = NULL;
+    /* **Transfer** ownership of buffer's memory to the chunkObject: */
+    chunk->base = buffer;
+    buffer = NULL;
+    chunk->len = (Py_ssize_t)len;

-    /* size_t->Py_ssize_t cast was validated above: */
-    chunk->len = (Py_ssize_t) len;
 #if PY_MAJOR_VERSION < 3
    if ((res = PyBuffer_FromObject((PyObject *)chunk, 0, chunk->len)) == NULL)
-        goto fail;
+        goto exit;
 #else
    if ((res = PyMemoryView_FromObject((PyObject*)chunk)) == NULL)
-        goto fail;
+        goto exit;
 #endif
-    /* PyBuffer_FromObject() created a new reference.  We'll release our
-     * reference held in 'chunk' in the 'cleanup' clause. */

-    goto cleanup;
-    fail:
-      assert (PyErr_Occurred());
-      if (res != NULL) {
-          Py_DECREF(res);
-          res = NULL;
-      }
-      /* Fall through to cleanup: */
-    cleanup:
-      if (chunk != NULL) {
-          Py_DECREF((PyObject *) chunk);
-      }
-      if (str != NULL) {
-          /* str's mem was allocated by PQunescapeBytea; must use PQfreemem: */
-          PQfreemem(str);
-      }
-      /* We allocated buffer with PyMem_Malloc; must use PyMem_Free: */
+exit:
+    Py_XDECREF((PyObject *)chunk);
    PyMem_Free(buffer);

    return res;
 }
+
+
+/* Map an ASCII code to the value of the hex digit it represents, or to -1
+ * if the char is not a hex digit. Explicitly 'signed char' because the -1
+ * marker is not representable where plain 'char' is unsigned.
+ */
+static const signed char hex_lut[128] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+/* Parse a bytea output buffer encoded in 'hex' format.
+ *
+ * the format is described in
+ * http://www.postgresql.org/docs/9.0/static/datatype-binary.html
+ *
+ * Parse the buffer in 'bufin', whose length is 'sizein'. The first two
+ * chars are assumed to be the "\x" prefix and are skipped.
+ * Return a new buffer allocated by PyMem_Malloc and set 'sizeout' to its size.
+ * In case of error set an exception and return NULL.
+ *
+ * Chars that are not hex digits are skipped and a trailing unpaired digit
+ * is dropped. Only bytes inside [bufin, bufin + sizein) are read.
+ */
+static char *
+psycopg_parse_hex(const char *bufin, Py_ssize_t sizein, Py_ssize_t *sizeout)
+{
+    const char *bufend = bufin + sizein;
+    const char *pi = bufin + 2;     /* past the \x */
+    char *bufout;
+    char *po;
+    int hi, lo;
+
+    /* Output size upper bound: an output byte is only stored after two
+     * digits have been read from the input following the prefix. */
+    po = bufout = PyMem_Malloc(sizein > 2 ? (size_t)((sizein - 2) >> 1) : 0);
+    if (NULL == bufout) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* Implementation note: we call this function upon database response, not
+     * user input (because we are parsing the output format of a buffer) so we
+     * don't expect errors. On bad input we reserve the right to return a bad
+     * output, not an error. Bad input must never cause an access outside
+     * the input or the output buffer though.
+     */
+    for (;;) {
+        /* Look for the high nibble, checking the bound before every read. */
+        do {
+            if (pi >= bufend) { goto endloop; }
+            hi = hex_lut[*pi++ & 0x7f];
+        } while (hi < 0);
+
+        /* Look for the low nibble: if the input ends first, the pending
+         * high nibble is discarded instead of being stored. */
+        do {
+            if (pi >= bufend) { goto endloop; }
+            lo = hex_lut[*pi++ & 0x7f];
+        } while (lo < 0);
+
+        *po++ = (char)((hi << 4) | lo);
+    }
+endloop:
+
+    *sizeout = po - bufout;
+    return bufout;
+}
+
+/* Parse a bytea output buffer encoded in 'escape' format.
+ *
+ * the format is described in
+ * http://www.postgresql.org/docs/9.0/static/datatype-binary.html
+ *
+ * Parse the buffer in 'bufin', whose length is 'sizein'.
+ * Return a new buffer allocated by PyMem_Malloc and set 'sizeout' to its size.
+ * In case of error set an exception and return NULL.
+ *
+ * A backslash followed by three octal digits (the first one in 0-3) decodes
+ * to the byte with that value; a backslash followed by any other char
+ * decodes to that char. Only bytes inside [bufin, bufin + sizein) are read:
+ * a lone backslash at the very end of the input is dropped.
+ */
+static char *
+psycopg_parse_escape(const char *bufin, Py_ssize_t sizein, Py_ssize_t *sizeout)
+{
+    const char *bufend = bufin + sizein;
+    const char *pi = bufin;
+    char *bufout;
+    char *po;
+
+    /* Output size upper bound: every output byte consumes at least one
+     * input byte. */
+    po = bufout = PyMem_Malloc(sizein);
+    if (NULL == bufout) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    while (pi < bufend) {
+        Py_ssize_t left = bufend - pi;  /* input bytes available from pi */
+
+        if (*pi != '\\') {
+            /* Unescaped char */
+            *po++ = *pi++;
+            continue;
+        }
+        if (left >= 4 &&
+            (pi[1] >= '0' && pi[1] <= '3') &&
+            (pi[2] >= '0' && pi[2] <= '7') &&
+            (pi[3] >= '0' && pi[3] <= '7'))
+        {
+            /* Escaped octal value */
+            *po++ = ((pi[1] - '0') << 6) |
+                    ((pi[2] - '0') << 3) |
+                    ((pi[3] - '0'));
+            pi += 4;
+        }
+        else if (left >= 2) {
+            /* Escaped char */
+            *po++ = pi[1];
+            pi += 2;
+        }
+        else {
+            /* Backslash with nothing after it: there is no char to emit
+             * and reading pi[1] would go past the end of the input. */
+            break;
+        }
+    }
+
+    *sizeout = po - bufout;
+    return bufout;
+}
+
--- a/tests/testutils.py
+++ b/tests/testutils.py
@ -140,24 +140,6 @@ def skip_if_no_namedtuple(f):
    return skip_if_no_namedtuple_


-def skip_if_broken_hex_binary(f):
-    """Decorator to detect libpq < 9.0 unable to parse bytea in hex format"""
-    def cope_with_hex_binary_(self):
-        from psycopg2 import InterfaceError
-        try:
-            return f(self)
-        except InterfaceError, e:
-            if '9.0' in str(e) and self.conn.server_version >= 90000:
-                return self.skipTest(
-                    # FIXME: we are only assuming the libpq is older here,
-                    # but we don't have a reliable way to detect the libpq
-                    # version, not pre-9 at least.
-                    "bytea broken with server >= 9.0, libpq < 9")
-            else:
-                raise
-
-    return cope_with_hex_binary_
-
 def skip_if_no_iobase(f):
    """Skip a test if io.TextIOBase is not available."""
    def skip_if_no_iobase_(self):
--- a/tests/types_basic.py
+++ b/tests/types_basic.py
@ -28,7 +28,7 @@ except:
    pass
 import sys
 import testutils
-from testutils import unittest, skip_if_broken_hex_binary
+from testutils import unittest
 from testconfig import dsn

 import psycopg2
@ -116,7 +116,6 @@ class TypesBasicTests(unittest.TestCase):
        s = self.execute("SELECT %s AS foo", (float("-inf"),))
        self.failUnless(str(s) == "-inf", "wrong float quoting: " + str(s))      

-    @skip_if_broken_hex_binary
    def testBinary(self):
        if sys.version_info[0] < 3:
            s = ''.join([chr(x) for x in range(256)])
@ -143,7 +142,6 @@ class TypesBasicTests(unittest.TestCase):
            b = psycopg2.Binary(bytes([]))
            self.assertEqual(str(b), "''::bytea")

-    @skip_if_broken_hex_binary
    def testBinaryRoundTrip(self):
        # test to make sure buffers returned by psycopg2 are
        # understood by execute:
@ -191,7 +189,6 @@ class TypesBasicTests(unittest.TestCase):
        s = self.execute("SELECT '{}'::text AS foo")
        self.failUnlessEqual(s, "{}")

-    @skip_if_broken_hex_binary
    @testutils.skip_from_python(3)
    def testTypeRoundtripBuffer(self):
        o1 = buffer("".join(map(chr, range(256))))
@ -204,7 +201,6 @@ class TypesBasicTests(unittest.TestCase):
        self.assertEqual(type(o1), type(o2))
        self.assertEqual(str(o1), str(o2))

-    @skip_if_broken_hex_binary
    @testutils.skip_from_python(3)
    def testTypeRoundtripBufferArray(self):
        o1 = buffer("".join(map(chr, range(256))))
@ -213,7 +209,6 @@ class TypesBasicTests(unittest.TestCase):
        self.assertEqual(type(o1[0]), type(o2[0]))
        self.assertEqual(str(o1[0]), str(o2[0]))

-    @skip_if_broken_hex_binary
    @testutils.skip_before_python(3)
    def testTypeRoundtripBytes(self):
        o1 = bytes(range(256))
@ -225,7 +220,6 @@ class TypesBasicTests(unittest.TestCase):
        o2 = self.execute("select %s;", (o1,))
        self.assertEqual(memoryview, type(o2))

-    @skip_if_broken_hex_binary
    @testutils.skip_before_python(3)
    def testTypeRoundtripBytesArray(self):
        o1 = bytes(range(256))
@ -233,7 +227,6 @@ class TypesBasicTests(unittest.TestCase):
        o2 = self.execute("select %s;", (o1,))
        self.assertEqual(memoryview, type(o2[0]))

-    @skip_if_broken_hex_binary
    @testutils.skip_before_python(2, 6)
    def testAdaptBytearray(self):
        o1 = bytearray(range(256))
@ -258,7 +251,6 @@ class TypesBasicTests(unittest.TestCase):
        else:
            self.assertEqual(memoryview, type(o2))

-    @skip_if_broken_hex_binary
    @testutils.skip_before_python(2, 7)
    def testAdaptMemoryview(self):
        o1 = memoryview(bytearray(range(256)))
@ -335,6 +327,92 @@ class AdaptSubclassTest(unittest.TestCase):
           del psycopg2.extensions.adapters[A, psycopg2.extensions.ISQLQuote]


+class ByteaParserTest(unittest.TestCase):
+    """Unit test for our bytea format parser.
+
+    The C function typecast_BINARY_cast is called directly through ctypes;
+    the whole test case is skipped where it can't be accessed that way.
+    """
+    def setUp(self):
+        try:
+            self._cast = self._import_cast()
+        except Exception, e:
+            return self.skipTest("can't test bytea parser: %s - %s"
+                % (e.__class__.__name__, e))
+
+    def _import_cast(self):
+        """Use ctypes to access the C function.
+
+        Raise any sort of error: we just support this where ctypes works as
+        expected.
+        """
+        import ctypes
+        # The function uses the Python C API and returns a Python object, so
+        # it must be called holding the GIL: load the library through pydll,
+        # as cdll would release the GIL around the call.
+        lib = ctypes.pydll.LoadLibrary(psycopg2._psycopg.__file__)
+        cast = lib.typecast_BINARY_cast
+        cast.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.py_object]
+        cast.restype = ctypes.py_object
+        return cast
+
+    def cast(self, buffer):
+        """Cast a buffer from the output format"""
+        l = buffer and len(buffer) or 0
+        rv = self._cast(buffer, l, None)
+
+        if rv is None:
+            return None
+
+        if sys.version_info[0] < 3:
+            return str(rv)
+        else:
+            return rv.tobytes()
+
+    def test_null(self):
+        rv = self.cast(None)
+        self.assertEqual(rv, None)
+
+    def test_blank(self):
+        rv = self.cast(b(''))
+        self.assertEqual(rv, b(''))
+
+    def test_blank_hex(self):
+        # Reported as problematic in ticket #48
+        rv = self.cast(b('\\x'))
+        self.assertEqual(rv, b(''))
+
+    def test_full_hex(self, upper=False):
+        buf = ''.join(("%02x" % i) for i in range(256))
+        if upper: buf = buf.upper()
+        buf = '\\x' + buf
+        rv = self.cast(b(buf))
+        if sys.version_info[0] < 3:
+            self.assertEqual(rv, ''.join(map(chr, range(256))))
+        else:
+            self.assertEqual(rv, bytes(range(256)))
+
+    def test_full_hex_upper(self):
+        return self.test_full_hex(upper=True)
+
+    def test_full_escaped_octal(self):
+        buf = ''.join(("\\%03o" % i) for i in range(256))
+        rv = self.cast(b(buf))
+        if sys.version_info[0] < 3:
+            self.assertEqual(rv, ''.join(map(chr, range(256))))
+        else:
+            self.assertEqual(rv, bytes(range(256)))
+
+    def test_escaped_mixed(self):
+        import string
+        buf = ''.join(("\\%03o" % i) for i in range(32))
+        buf += string.ascii_letters
+        buf += ''.join('\\' + c for c in string.ascii_letters)
+        buf += '\\\\'
+        rv = self.cast(b(buf))
+        if sys.version_info[0] < 3:
+            tgt = ''.join(map(chr, range(32))) \
+                + string.ascii_letters * 2 + '\\'
+        else:
+            tgt = bytes(range(32)) + \
+                (string.ascii_letters * 2 + '\\').encode('ascii')
+
+        self.assertEqual(rv, tgt)
+
+
 def test_suite():
    return unittest.TestLoader().loadTestsFromName(__name__)