From 121cf3b8f8426765d983579d3a4b2e932429cd9f Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Wed, 12 Oct 2016 01:10:31 +0100 Subject: [PATCH 1/8] Optimize UTF8 and Latin1 decoding Cache a pointer to a fast decoding function when the connection encoding is set, to skip a repeated codec lookup for every string. --- psycopg/connection.h | 3 +++ psycopg/connection_int.c | 29 +++++++++++++++++++++++++++++ psycopg/typecast_basic.c | 11 ++++++++--- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/psycopg/connection.h b/psycopg/connection.h index ec107429..b925bd47 100644 --- a/psycopg/connection.h +++ b/psycopg/connection.h @@ -122,6 +122,9 @@ struct connectionObject { int autocommit; PyObject *cursor_factory; /* default cursor factory from cursor() */ + + /* Pointer to a decoding function, e.g. PyUnicode_DecodeUTF8 */ + PyObject *(*cdecoder)(const char *, Py_ssize_t, const char *); }; /* map isolation level values into a numeric const */ diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index 43d0fdae..62976d48 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -361,6 +361,31 @@ exit: return rv; } + +/* set fast access functions according to the currently selected codec + */ +void +conn_set_fast_codec(connectionObject *self) +{ + Dprintf("conn_set_fast_codec: codec=%s", self->codec); + + if (0 == strcmp(self->codec, "utf_8")) { + Dprintf("conn_set_fast_codec: PyUnicode_DecodeUTF8"); + self->cdecoder = PyUnicode_DecodeUTF8; + return; + } + + if (0 == strcmp(self->codec, "iso8859_1")) { + Dprintf("conn_set_fast_codec: PyUnicode_DecodeLatin1"); + self->cdecoder = PyUnicode_DecodeLatin1; + return; + } + + Dprintf("conn_set_fast_codec: no fast codec"); + self->cdecoder = NULL; +} + + /* Read the client encoding from the connection. 
* * Store the encoding in the pgconn->encoding field and the name of the @@ -402,6 +427,8 @@ conn_read_encoding(connectionObject *self, PGconn *pgconn) self->codec = codec; codec = NULL; + conn_set_fast_codec(self); + rv = 0; exit: @@ -1243,6 +1270,8 @@ conn_set_client_encoding(connectionObject *self, const char *enc) codec = NULL; } + conn_set_fast_codec(self); + Dprintf("conn_set_client_encoding: set encoding to %s (codec: %s)", self->encoding, self->codec); diff --git a/psycopg/typecast_basic.c b/psycopg/typecast_basic.c index a31047f3..760555ef 100644 --- a/psycopg/typecast_basic.c +++ b/psycopg/typecast_basic.c @@ -93,12 +93,17 @@ typecast_STRING_cast(const char *s, Py_ssize_t len, PyObject *curs) static PyObject * typecast_UNICODE_cast(const char *s, Py_ssize_t len, PyObject *curs) { - char *enc; + connectionObject *conn; if (s == NULL) { Py_RETURN_NONE; } - enc = ((cursorObject*)curs)->conn->codec; - return PyUnicode_Decode(s, len, enc, NULL); + conn = ((cursorObject*)curs)->conn; + if (conn->cdecoder) { + return conn->cdecoder(s, len, NULL); + } + else { + return PyUnicode_Decode(s, len, conn->codec, NULL); + } } /** BOOLEAN - cast boolean value into right python object **/ From f439ca61d678ed2fe34c132580cd6e8a581819f7 Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Mon, 26 Dec 2016 12:25:13 +0100 Subject: [PATCH 2/8] conn->codec rename to pyenc --- doc/src/extensions.rst | 2 +- doc/src/usage.rst | 2 +- psycopg/adapter_qstring.c | 2 +- psycopg/connection.h | 2 +- psycopg/connection_int.c | 90 +++++++++++++++++++-------------------- psycopg/connection_type.c | 2 +- psycopg/cursor_type.c | 2 +- psycopg/error.h | 2 +- psycopg/error_type.c | 4 +- psycopg/lobject_type.c | 4 +- psycopg/microprotocols.c | 6 +-- psycopg/pqpath.c | 12 +++--- psycopg/typecast.c | 2 +- psycopg/typecast_basic.c | 2 +- 14 files changed, 67 insertions(+), 67 deletions(-) diff --git a/doc/src/extensions.rst b/doc/src/extensions.rst index b661895d..9c5a8538 100644 --- 
a/doc/src/extensions.rst +++ b/doc/src/extensions.rst @@ -417,7 +417,7 @@ details. .. data:: encodings - Mapping from `PostgreSQL encoding`__ names to `Python codec`__ names. + Mapping from `PostgreSQL encoding`__ to `Python encoding`__ names. Used by Psycopg when adapting or casting unicode strings. See :ref:`unicode-handling`. diff --git a/doc/src/usage.rst b/doc/src/usage.rst index e768f372..d9fea755 100644 --- a/doc/src/usage.rst +++ b/doc/src/usage.rst @@ -355,7 +355,7 @@ Unicode handling Psycopg can exchange Unicode data with a PostgreSQL database. Python `!unicode` objects are automatically *encoded* in the client encoding defined on the database connection (the `PostgreSQL encoding`__, available in -`connection.encoding`, is translated into a `Python codec`__ using the +`connection.encoding`, is translated into a `Python encoding`__ using the `~psycopg2.extensions.encodings` mapping):: >>> print u, type(u) diff --git a/psycopg/adapter_qstring.c b/psycopg/adapter_qstring.c index 8c5a8f10..eca42182 100644 --- a/psycopg/adapter_qstring.c +++ b/psycopg/adapter_qstring.c @@ -43,7 +43,7 @@ _qstring_get_encoding(qstringObject *self) conn->encoding but if the encoding is not specified we don't know what to do and we raise an exception */ if (self->conn) { - return self->conn->codec; + return self->conn->pyenc; } else { return self->encoding ? 
self->encoding : default_encoding; diff --git a/psycopg/connection.h b/psycopg/connection.h index b925bd47..d108b71f 100644 --- a/psycopg/connection.h +++ b/psycopg/connection.h @@ -83,7 +83,7 @@ struct connectionObject { char *dsn; /* data source name */ char *critical; /* critical error on this connection */ char *encoding; /* current backend encoding */ - char *codec; /* python codec name for encoding */ + char *pyenc; /* connection encoding python name */ long int closed; /* 1 means connection has been closed; 2 that something horrible happened */ diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index a63b47e9..a180460a 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -61,8 +61,8 @@ conn_text_from_chars(connectionObject *self, const char *str) #if PY_MAJOR_VERSION < 3 return PyString_FromString(str); #else - const char *codec = self ? self->codec : "ascii"; - return PyUnicode_Decode(str, strlen(str), codec, "replace"); + const char *pyenc = self ? self->pyenc : "ascii"; + return PyUnicode_Decode(str, strlen(str), pyenc, "replace"); #endif } @@ -321,43 +321,43 @@ exit: return rv; } -/* Convert a PostgreSQL encoding to a Python codec. +/* Convert a PostgreSQL encoding name to a Python encoding name. * - * Set 'codec' to a new copy of the codec name allocated on the Python heap. + * Set 'pyenc' to a new copy of the encoding name allocated on the Python heap. * Return 0 in case of success, else -1 and set an exception. * - * 'enc' should be already normalized (uppercase, no - or _). + * 'pgenc' should be already normalized (uppercase, no - or _). 
*/ RAISES_NEG static int -conn_encoding_to_codec(const char *enc, char **codec) +conn_pgenc_to_pyenc(const char *pgenc, char **pyenc) { char *tmp; Py_ssize_t size; - PyObject *pyenc = NULL; + PyObject *opyenc = NULL; int rv = -1; - /* Find the Py codec name from the PG encoding */ - if (!(pyenc = PyDict_GetItemString(psycoEncodings, enc))) { + /* Find the Py encoding name from the PG encoding */ + if (!(opyenc = PyDict_GetItemString(psycoEncodings, pgenc))) { PyErr_Format(OperationalError, - "no Python codec for client encoding '%s'", enc); + "no Python encoding for PostgreSQL encoding '%s'", pgenc); goto exit; } - /* Convert the codec in a bytes string to extract the c string. */ - Py_INCREF(pyenc); - if (!(pyenc = psycopg_ensure_bytes(pyenc))) { + /* Convert the encoding in a bytes string to extract the c string. */ + Py_INCREF(opyenc); + if (!(opyenc = psycopg_ensure_bytes(opyenc))) { goto exit; } - if (-1 == Bytes_AsStringAndSize(pyenc, &tmp, &size)) { + if (-1 == Bytes_AsStringAndSize(opyenc, &tmp, &size)) { goto exit; } - /* have our own copy of the python codec name */ - rv = psycopg_strdup(codec, tmp, size); + /* have our own copy of the python encoding name */ + rv = psycopg_strdup(pyenc, tmp, size); exit: - Py_XDECREF(pyenc); + Py_XDECREF(opyenc); return rv; } @@ -367,15 +367,15 @@ exit: void conn_set_fast_codec(connectionObject *self) { - Dprintf("conn_set_fast_codec: codec=%s", self->codec); + Dprintf("conn_set_fast_codec: encoding=%s", self->pyenc); - if (0 == strcmp(self->codec, "utf_8")) { + if (0 == strcmp(self->pyenc, "utf_8")) { Dprintf("conn_set_fast_codec: PyUnicode_DecodeUTF8"); self->cdecoder = PyUnicode_DecodeUTF8; return; } - if (0 == strcmp(self->codec, "iso8859_1")) { + if (0 == strcmp(self->pyenc, "iso8859_1")) { Dprintf("conn_set_fast_codec: PyUnicode_DecodeLatin1"); self->cdecoder = PyUnicode_DecodeLatin1; return; @@ -389,7 +389,7 @@ conn_set_fast_codec(connectionObject *self) /* Read the client encoding from the connection. 
* * Store the encoding in the pgconn->encoding field and the name of the - * matching python codec in codec. The buffers are allocated on the Python + * matching python encoding in pyenc. The buffers are allocated on the Python * heap. * * Return 0 on success, else nonzero. @@ -397,7 +397,7 @@ conn_set_fast_codec(connectionObject *self) RAISES_NEG static int conn_read_encoding(connectionObject *self, PGconn *pgconn) { - char *enc = NULL, *codec = NULL; + char *pgenc = NULL, *pyenc = NULL; const char *tmp; int rv = -1; @@ -409,31 +409,31 @@ conn_read_encoding(connectionObject *self, PGconn *pgconn) goto exit; } - if (0 > clear_encoding_name(tmp, &enc)) { + if (0 > clear_encoding_name(tmp, &pgenc)) { goto exit; } /* Look for this encoding in Python codecs. */ - if (0 > conn_encoding_to_codec(enc, &codec)) { + if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { goto exit; } - /* Good, success: store the encoding/codec in the connection. */ + /* Good, success: store the encoding/pyenc in the connection. 
*/ PyMem_Free(self->encoding); - self->encoding = enc; - enc = NULL; + self->encoding = pgenc; + pgenc = NULL; - PyMem_Free(self->codec); - self->codec = codec; - codec = NULL; + PyMem_Free(self->pyenc); + self->pyenc = pyenc; + pyenc = NULL; conn_set_fast_codec(self); rv = 0; exit: - PyMem_Free(enc); - PyMem_Free(codec); + PyMem_Free(pgenc); + PyMem_Free(pyenc); return rv; } @@ -1252,21 +1252,21 @@ endlock: /* conn_set_client_encoding - switch client encoding on connection */ RAISES_NEG int -conn_set_client_encoding(connectionObject *self, const char *enc) +conn_set_client_encoding(connectionObject *self, const char *pgenc) { PGresult *pgres = NULL; char *error = NULL; int res = -1; - char *codec = NULL; + char *pyenc = NULL; char *clean_enc = NULL; /* If the current encoding is equal to the requested one we don't issue any query to the backend */ - if (strcmp(self->encoding, enc) == 0) return 0; + if (strcmp(self->encoding, pgenc) == 0) return 0; - /* We must know what python codec this encoding is. */ - if (0 > clear_encoding_name(enc, &clean_enc)) { goto exit; } - if (0 > conn_encoding_to_codec(clean_enc, &codec)) { goto exit; } + /* We must know what python encoding this encoding is. */ + if (0 > clear_encoding_name(pgenc, &clean_enc)) { goto exit; } + if (0 > conn_pgenc_to_pyenc(clean_enc, &pyenc)) { goto exit; } Py_BEGIN_ALLOW_THREADS; pthread_mutex_lock(&self->lock); @@ -1290,18 +1290,18 @@ conn_set_client_encoding(connectionObject *self, const char *enc) clean_enc = NULL; } - /* Store the python codec too. */ + /* Store the python encoding name too. 
*/ { - char *tmp = self->codec; - self->codec = codec; + char *tmp = self->pyenc; + self->pyenc = pyenc; PyMem_Free(tmp); - codec = NULL; + pyenc = NULL; } conn_set_fast_codec(self); - Dprintf("conn_set_client_encoding: set encoding to %s (codec: %s)", - self->encoding, self->codec); + Dprintf("conn_set_client_encoding: set encoding to %s (Python: %s)", + self->encoding, self->pyenc); endlock: pthread_mutex_unlock(&self->lock); @@ -1312,7 +1312,7 @@ endlock: exit: PyMem_Free(clean_enc); - PyMem_Free(codec); + PyMem_Free(pyenc); return res; } diff --git a/psycopg/connection_type.c b/psycopg/connection_type.c index 485a92b7..df4ae864 100644 --- a/psycopg/connection_type.c +++ b/psycopg/connection_type.c @@ -1164,7 +1164,7 @@ connection_dealloc(PyObject* obj) PyMem_Free(self->dsn); PyMem_Free(self->encoding); - PyMem_Free(self->codec); + PyMem_Free(self->pyenc); if (self->critical) free(self->critical); if (self->cancel) PQfreeCancel(self->cancel); diff --git a/psycopg/cursor_type.c b/psycopg/cursor_type.c index baa5b8f7..b2aef3df 100644 --- a/psycopg/cursor_type.c +++ b/psycopg/cursor_type.c @@ -286,7 +286,7 @@ static PyObject *_psyco_curs_validate_sql_basic( Py_INCREF(sql); } else if (PyUnicode_Check(sql)) { - char *enc = self->conn->codec; + char *enc = self->conn->pyenc; sql = PyUnicode_AsEncodedString(sql, enc, NULL); /* if there was an error during the encoding from unicode to the target encoding, we just let the exception propagate */ diff --git a/psycopg/error.h b/psycopg/error.h index 9ae6dbd3..8bc4df5e 100644 --- a/psycopg/error.h +++ b/psycopg/error.h @@ -34,7 +34,7 @@ typedef struct { PyObject *pgerror; PyObject *pgcode; cursorObject *cursor; - char *codec; + char *pyenc; PGresult *pgres; } errorObject; diff --git a/psycopg/error_type.c b/psycopg/error_type.c index 75761e81..40b71aa6 100644 --- a/psycopg/error_type.c +++ b/psycopg/error_type.c @@ -43,7 +43,7 @@ error_text_from_chars(errorObject *self, const char *str) return PyString_FromString(str); #else 
return PyUnicode_Decode(str, strlen(str), - self->codec ? self->codec : "ascii", "replace"); + self->pyenc ? self->pyenc : "ascii", "replace"); #endif } @@ -113,7 +113,7 @@ error_dealloc(errorObject *self) { PyObject_GC_UnTrack((PyObject *)self); error_clear(self); - PyMem_Free(self->codec); + PyMem_Free(self->pyenc); CLEARPGRES(self->pgres); Py_TYPE(self)->tp_free((PyObject *)self); diff --git a/psycopg/lobject_type.c b/psycopg/lobject_type.c index ddda0daf..61c92324 100644 --- a/psycopg/lobject_type.c +++ b/psycopg/lobject_type.c @@ -86,7 +86,7 @@ psyco_lobj_write(lobjectObject *self, PyObject *args) data = obj; } else if (PyUnicode_Check(obj)) { - if (!(data = PyUnicode_AsEncodedString(obj, self->conn->codec, NULL))) { + if (!(data = PyUnicode_AsEncodedString(obj, self->conn->pyenc, NULL))) { goto exit; } } @@ -150,7 +150,7 @@ psyco_lobj_read(lobjectObject *self, PyObject *args) if (self->mode & LOBJECT_BINARY) { res = Bytes_FromStringAndSize(buffer, size); } else { - res = PyUnicode_Decode(buffer, size, self->conn->codec, NULL); + res = PyUnicode_Decode(buffer, size, self->conn->pyenc, NULL); } PyMem_Free(buffer); diff --git a/psycopg/microprotocols.c b/psycopg/microprotocols.c index 1687bc26..7bd33745 100644 --- a/psycopg/microprotocols.c +++ b/psycopg/microprotocols.c @@ -251,9 +251,9 @@ microprotocol_getquoted(PyObject *obj, connectionObject *conn) /* Convert to bytes. */ if (res && PyUnicode_CheckExact(res)) { PyObject *b; - const char *codec; - codec = (conn && conn->codec) ? conn->codec : "utf8"; - b = PyUnicode_AsEncodedString(res, codec, NULL); + const char *pyenc; + pyenc = (conn && conn->pyenc) ? 
conn->pyenc : "utf8"; + b = PyUnicode_AsEncodedString(res, pyenc, NULL); Py_DECREF(res); res = b; } diff --git a/psycopg/pqpath.c b/psycopg/pqpath.c index d02cb708..9dbd489a 100644 --- a/psycopg/pqpath.c +++ b/psycopg/pqpath.c @@ -226,8 +226,8 @@ pq_raise(connectionObject *conn, cursorObject *curs, PGresult **pgres) if (pyerr && PyObject_TypeCheck(pyerr, &errorType)) { errorObject *perr = (errorObject *)pyerr; - PyMem_Free(perr->codec); - psycopg_strdup(&perr->codec, conn->codec, 0); + PyMem_Free(perr->pyenc); + psycopg_strdup(&perr->pyenc, conn->pyenc, 0); Py_CLEAR(perr->pgerror); perr->pgerror = error_text_from_chars(perr, err); @@ -1332,8 +1332,8 @@ _pq_copy_in_v3(cursorObject *curs) /* a file may return unicode if implements io.TextIOBase */ if (PyUnicode_Check(o)) { PyObject *tmp; - Dprintf("_pq_copy_in_v3: encoding in %s", curs->conn->codec); - if (!(tmp = PyUnicode_AsEncodedString(o, curs->conn->codec, NULL))) { + Dprintf("_pq_copy_in_v3: encoding in %s", curs->conn->pyenc); + if (!(tmp = PyUnicode_AsEncodedString(o, curs->conn->pyenc, NULL))) { Dprintf("_pq_copy_in_v3: encoding() failed"); error = 1; break; @@ -1488,7 +1488,7 @@ _pq_copy_out_v3(cursorObject *curs) if (len > 0 && buffer) { if (is_text) { - obj = PyUnicode_Decode(buffer, len, curs->conn->codec, NULL); + obj = PyUnicode_Decode(buffer, len, curs->conn->pyenc, NULL); } else { obj = Bytes_FromStringAndSize(buffer, len); } @@ -1638,7 +1638,7 @@ retry: Dprintf("pq_read_replication_message: >>%.*s<<", data_size, buffer + hdr); if (repl->decode) { - str = PyUnicode_Decode(buffer + hdr, data_size, conn->codec, NULL); + str = PyUnicode_Decode(buffer + hdr, data_size, conn->pyenc, NULL); } else { str = Bytes_FromStringAndSize(buffer + hdr, data_size); } diff --git a/psycopg/typecast.c b/psycopg/typecast.c index 1cae869f..d83c390b 100644 --- a/psycopg/typecast.c +++ b/psycopg/typecast.c @@ -672,7 +672,7 @@ typecast_cast(PyObject *obj, const char *str, Py_ssize_t len, PyObject *curs) s = 
PyString_FromStringAndSize(str, len); #else s = PyUnicode_Decode(str, len, - ((cursorObject *)curs)->conn->codec, NULL); + ((cursorObject *)curs)->conn->pyenc, NULL); #endif } else { diff --git a/psycopg/typecast_basic.c b/psycopg/typecast_basic.c index 760555ef..d55820c2 100644 --- a/psycopg/typecast_basic.c +++ b/psycopg/typecast_basic.c @@ -102,7 +102,7 @@ typecast_UNICODE_cast(const char *s, Py_ssize_t len, PyObject *curs) return conn->cdecoder(s, len, NULL); } else { - return PyUnicode_Decode(s, len, conn->codec, NULL); + return PyUnicode_Decode(s, len, conn->pyenc, NULL); } } From 17a74cc77126a15d289d2b568fcf1251b05bcbb3 Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Mon, 26 Dec 2016 16:41:46 +0100 Subject: [PATCH 3/8] Setting connection encoding refactored The code paths that read the encoding on connection and that store the new encoding in the structure after changing it in the backend are unified into a single function. --- psycopg/connection_int.c | 109 ++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index a180460a..5700b971 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -362,7 +362,7 @@ exit: } -/* set fast access functions according to the currently selected codec +/* set fast access functions according to the currently selected encoding */ void conn_set_fast_codec(connectionObject *self) @@ -386,46 +386,36 @@ conn_set_fast_codec(connectionObject *self) } -/* Read the client encoding from the connection. +/* Store the encoding in the pgconn->encoding field and set the other related + * encoding fields in the connection structure. * - * Store the encoding in the pgconn->encoding field and the name of the - * matching python encoding in pyenc. The buffers are allocated on the Python - * heap. - * - * Return 0 on success, else nonzero. + * Return 0 on success, else -1. 
*/ RAISES_NEG static int -conn_read_encoding(connectionObject *self, PGconn *pgconn) +conn_set_encoding(connectionObject *self, const char *encoding) { - char *pgenc = NULL, *pyenc = NULL; - const char *tmp; int rv = -1; + char *pgenc = NULL, *pyenc = NULL; - tmp = PQparameterStatus(pgconn, "client_encoding"); - Dprintf("conn_connect: client encoding: %s", tmp ? tmp : "(none)"); - if (!tmp) { - PyErr_SetString(OperationalError, - "server didn't return client encoding"); - goto exit; - } - - if (0 > clear_encoding_name(tmp, &pgenc)) { - goto exit; - } + if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } /* Look for this encoding in Python codecs. */ - if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { - goto exit; - } + if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { goto exit; } /* Good, success: store the encoding/pyenc in the connection. */ - PyMem_Free(self->encoding); - self->encoding = pgenc; - pgenc = NULL; + { + char *tmp = self->encoding; + self->encoding = pgenc; + PyMem_Free(tmp); + pgenc = NULL; + } - PyMem_Free(self->pyenc); - self->pyenc = pyenc; - pyenc = NULL; + { + char *tmp = self->pyenc; + self->pyenc = pyenc; + PyMem_Free(tmp); + pyenc = NULL; + } conn_set_fast_codec(self); @@ -438,6 +428,35 @@ exit: } +/* Read the client encoding from the backend and store it in the connection. + * + * Return 0 on success, else -1. + */ +RAISES_NEG static int +conn_read_encoding(connectionObject *self, PGconn *pgconn) +{ + const char *encoding; + int rv = -1; + + encoding = PQparameterStatus(pgconn, "client_encoding"); + Dprintf("conn_connect: client encoding: %s", encoding ? 
encoding : "(none)"); + if (!encoding) { + PyErr_SetString(OperationalError, + "server didn't return client encoding"); + goto exit; + } + + if (0 > conn_set_encoding(self, encoding)) { + goto exit; + } + + rv = 0; + +exit: + return rv; +} + + RAISES_NEG int conn_get_isolation_level(connectionObject *self) { @@ -1282,33 +1301,19 @@ conn_set_client_encoding(connectionObject *self, const char *pgenc) goto endlock; } - /* no error, we can proceed and store the new encoding */ - { - char *tmp = self->encoding; - self->encoding = clean_enc; - PyMem_Free(tmp); - clean_enc = NULL; - } - - /* Store the python encoding name too. */ - { - char *tmp = self->pyenc; - self->pyenc = pyenc; - PyMem_Free(tmp); - pyenc = NULL; - } - - conn_set_fast_codec(self); - - Dprintf("conn_set_client_encoding: set encoding to %s (Python: %s)", - self->encoding, self->pyenc); - endlock: pthread_mutex_unlock(&self->lock); Py_END_ALLOW_THREADS; - if (res < 0) + if (res < 0) { pq_complete_error(self, &pgres, &error); + goto exit; + } + + res = conn_set_encoding(self, pgenc); + + Dprintf("conn_set_client_encoding: set encoding to %s (Python: %s)", + self->encoding, self->pyenc); exit: PyMem_Free(clean_enc); From a255e4e1c6bbe32b0865da410fecd7be067902a7 Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Mon, 26 Dec 2016 17:40:08 +0100 Subject: [PATCH 4/8] Store python encoding and decoding functions in the connection Unused for now: will be used instead of 'pyenc', which is to be dropped. 
--- psycopg/connection.h | 4 +++ psycopg/connection_int.c | 70 ++++++++++++++++++++++++++++++++++++--- psycopg/connection_type.c | 6 ++++ 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/psycopg/connection.h b/psycopg/connection.h index d108b71f..32b34fa2 100644 --- a/psycopg/connection.h +++ b/psycopg/connection.h @@ -83,6 +83,7 @@ struct connectionObject { char *dsn; /* data source name */ char *critical; /* critical error on this connection */ char *encoding; /* current backend encoding */ + /* TODO: drop */ char *pyenc; /* connection encoding python name */ long int closed; /* 1 means connection has been closed; @@ -125,6 +126,9 @@ struct connectionObject { /* Pointer to a decoding function, e.g. PyUnicode_DecodeUTF8 */ PyObject *(*cdecoder)(const char *, Py_ssize_t, const char *); + + PyObject *pyencoder; /* python codec encoding function */ + PyObject *pydecoder; /* python codec decoding function */ }; /* map isolation level values into a numeric const */ diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index 5700b971..83b706ba 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -364,7 +364,7 @@ exit: /* set fast access functions according to the currently selected encoding */ -void +static void conn_set_fast_codec(connectionObject *self) { Dprintf("conn_set_fast_codec: encoding=%s", self->pyenc); @@ -386,21 +386,72 @@ conn_set_fast_codec(connectionObject *self) } +/* Convert a Postgres encoding into Python encoding and decoding functions. + * + * Return 0 on success, else -1 and set an exception. 
+ */ +RAISES_NEG static int +conn_get_python_codec(const char *encoding, PyObject **pyenc, PyObject **pydec) +{ + int rv = -1; + char *pgenc = NULL; + PyObject *encname = NULL; + PyObject *m = NULL, *f = NULL, *codec = NULL; + PyObject *enc_tmp = NULL, *dec_tmp = NULL; + + if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } + + /* Find the Py encoding name from the PG encoding */ + if (!(encname = PyDict_GetItemString(psycoEncodings, pgenc))) { + PyErr_Format(OperationalError, + "no Python encoding for PostgreSQL encoding '%s'", pgenc); + goto exit; + } + Py_INCREF(encname); + + /* Look up the python codec */ + if (!(m = PyImport_ImportModule("codecs"))) { goto exit; } + if (!(f = PyObject_GetAttrString(m, "lookup"))) { goto exit; } + if (!(codec = PyObject_CallFunctionObjArgs(f, encname, NULL))) { goto exit; } + if (!(enc_tmp = PyObject_GetAttrString(codec, "encode"))) { goto exit; } + if (!(dec_tmp = PyObject_GetAttrString(codec, "decode"))) { goto exit; } + + /* success */ + *pyenc = enc_tmp; enc_tmp = NULL; + *pydec = dec_tmp; dec_tmp = NULL; + rv = 0; + +exit: + Py_XDECREF(enc_tmp); + Py_XDECREF(dec_tmp); + Py_XDECREF(codec); + Py_XDECREF(f); + Py_XDECREF(m); + Py_XDECREF(encname); + PyMem_Free(pgenc); + + return rv; +} + + /* Store the encoding in the pgconn->encoding field and set the other related * encoding fields in the connection structure. * - * Return 0 on success, else -1. + * Return 0 on success, else -1 and set an exception. */ RAISES_NEG static int conn_set_encoding(connectionObject *self, const char *encoding) { int rv = -1; char *pgenc = NULL, *pyenc = NULL; + PyObject *enc_tmp = NULL, *dec_tmp = NULL; - if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } + if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } /* TODO: drop */ /* Look for this encoding in Python codecs. 
*/ - if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { goto exit; } + if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { goto exit; } /* TODO: drop */ + + if (0 > conn_get_python_codec(encoding, &enc_tmp, &dec_tmp)) { goto exit; } /* Good, success: store the encoding/pyenc in the connection. */ { @@ -411,17 +462,28 @@ conn_set_encoding(connectionObject *self, const char *encoding) } { + /* TODO: drop */ char *tmp = self->pyenc; self->pyenc = pyenc; PyMem_Free(tmp); pyenc = NULL; } + Py_CLEAR(self->pyencoder); + self->pyencoder = enc_tmp; + enc_tmp = NULL; + + Py_CLEAR(self->pydecoder); + self->pydecoder = dec_tmp; + dec_tmp = NULL; + conn_set_fast_codec(self); rv = 0; exit: + Py_XDECREF(enc_tmp); + Py_XDECREF(dec_tmp); PyMem_Free(pgenc); PyMem_Free(pyenc); return rv; diff --git a/psycopg/connection_type.c b/psycopg/connection_type.c index df4ae864..d22ceb97 100644 --- a/psycopg/connection_type.c +++ b/psycopg/connection_type.c @@ -1141,6 +1141,9 @@ connection_clear(connectionObject *self) Py_CLEAR(self->notifies); Py_CLEAR(self->string_types); Py_CLEAR(self->binary_types); + Py_CLEAR(self->cursor_factory); + Py_CLEAR(self->pyencoder); + Py_CLEAR(self->pydecoder); return 0; } @@ -1216,6 +1219,9 @@ connection_traverse(connectionObject *self, visitproc visit, void *arg) Py_VISIT(self->notifies); Py_VISIT(self->string_types); Py_VISIT(self->binary_types); + Py_VISIT(self->cursor_factory); + Py_VISIT(self->pyencoder); + Py_VISIT(self->pydecoder); return 0; } From dfe547856ee946163dfdc695723f7ab67865228b Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Mon, 26 Dec 2016 20:01:19 +0100 Subject: [PATCH 5/8] Use -1 instead of 0 to say "calculate the length" in many funcs 0 is a valid length, isn't it? 
--- psycopg/adapter_qstring.c | 2 +- psycopg/connection_type.c | 2 +- psycopg/cursor_type.c | 14 +++++++------- psycopg/pqpath.c | 4 ++-- psycopg/psycopg.h | 2 +- psycopg/psycopgmodule.c | 8 ++------ psycopg/utils.c | 14 ++++++++------ 7 files changed, 22 insertions(+), 24 deletions(-) diff --git a/psycopg/adapter_qstring.c b/psycopg/adapter_qstring.c index eca42182..febb49ac 100644 --- a/psycopg/adapter_qstring.c +++ b/psycopg/adapter_qstring.c @@ -178,7 +178,7 @@ qstring_set_encoding(qstringObject *self, PyObject *pyenc) Py_INCREF(pyenc); if (!(pyenc = psycopg_ensure_bytes(pyenc))) { goto exit; } if (!(tmp = Bytes_AsString(pyenc))) { goto exit; } - if (0 > psycopg_strdup(&cenc, tmp, 0)) { goto exit; } + if (0 > psycopg_strdup(&cenc, tmp, -1)) { goto exit; } Dprintf("qstring_set_encoding: encoding set to %s", cenc); PyMem_Free((void *)self->encoding); diff --git a/psycopg/connection_type.c b/psycopg/connection_type.c index d22ceb97..ba4e4335 100644 --- a/psycopg/connection_type.c +++ b/psycopg/connection_type.c @@ -1097,7 +1097,7 @@ connection_setup(connectionObject *self, const char *dsn, long int async) self, async, Py_REFCNT(self) ); - if (0 > psycopg_strdup(&self->dsn, dsn, 0)) { goto exit; } + if (0 > psycopg_strdup(&self->dsn, dsn, -1)) { goto exit; } if (!(self->notice_list = PyList_New(0))) { goto exit; } if (!(self->notifies = PyList_New(0))) { goto exit; } self->async = async; diff --git a/psycopg/cursor_type.c b/psycopg/cursor_type.c index b2aef3df..c580daa2 100644 --- a/psycopg/cursor_type.c +++ b/psycopg/cursor_type.c @@ -1079,7 +1079,7 @@ psyco_curs_callproc(cursorObject *self, PyObject *args) if (!(cpname = Bytes_AsString(pname))) { goto exit; } if (!(scpnames[i] = psycopg_escape_identifier( - self->conn, cpname, 0))) { + self->conn, cpname, -1))) { Py_CLEAR(pname); goto exit; } @@ -1457,12 +1457,12 @@ psyco_curs_copy_from(cursorObject *self, PyObject *args, PyObject *kwargs) goto exit; if (!(quoted_delimiter = psycopg_escape_string( - self->conn, 
sep, 0, NULL, NULL))) { + self->conn, sep, -1, NULL, NULL))) { goto exit; } if (!(quoted_null = psycopg_escape_string( - self->conn, null, 0, NULL, NULL))) { + self->conn, null, -1, NULL, NULL))) { goto exit; } @@ -1551,12 +1551,12 @@ psyco_curs_copy_to(cursorObject *self, PyObject *args, PyObject *kwargs) goto exit; if (!(quoted_delimiter = psycopg_escape_string( - self->conn, sep, 0, NULL, NULL))) { + self->conn, sep, -1, NULL, NULL))) { goto exit; } if (!(quoted_null = psycopg_escape_string( - self->conn, null, 0, NULL, NULL))) { + self->conn, null, -1, NULL, NULL))) { goto exit; } @@ -1899,10 +1899,10 @@ cursor_setup(cursorObject *self, connectionObject *conn, const char *name) Dprintf("cursor_setup: parameters: name = %s, conn = %p", name, conn); if (name) { - if (0 > psycopg_strdup(&self->name, name, 0)) { + if (0 > psycopg_strdup(&self->name, name, -1)) { return -1; } - if (!(self->qname = psycopg_escape_identifier(conn, name, 0))) { + if (!(self->qname = psycopg_escape_identifier(conn, name, -1))) { return -1; } } diff --git a/psycopg/pqpath.c b/psycopg/pqpath.c index 9dbd489a..c8d9c46b 100644 --- a/psycopg/pqpath.c +++ b/psycopg/pqpath.c @@ -227,7 +227,7 @@ pq_raise(connectionObject *conn, cursorObject *curs, PGresult **pgres) errorObject *perr = (errorObject *)pyerr; PyMem_Free(perr->pyenc); - psycopg_strdup(&perr->pyenc, conn->pyenc, 0); + psycopg_strdup(&perr->pyenc, conn->pyenc, -1); Py_CLEAR(perr->pgerror); perr->pgerror = error_text_from_chars(perr, err); @@ -765,7 +765,7 @@ pq_tpc_command_locked(connectionObject *conn, const char *cmd, const char *tid, PyEval_RestoreThread(*tstate); /* convert the xid into the postgres transaction_id and quote it. 
*/ - if (!(etid = psycopg_escape_string(conn, tid, 0, NULL, NULL))) + if (!(etid = psycopg_escape_string(conn, tid, -1, NULL, NULL))) { goto exit; } /* prepare the command to the server */ diff --git a/psycopg/psycopg.h b/psycopg/psycopg.h index 438d7636..fc5b533e 100644 --- a/psycopg/psycopg.h +++ b/psycopg/psycopg.h @@ -129,7 +129,7 @@ RAISES HIDDEN PyObject *psyco_set_error(PyObject *exc, cursorObject *curs, const HIDDEN char *psycopg_escape_string(connectionObject *conn, const char *from, Py_ssize_t len, char *to, Py_ssize_t *tolen); HIDDEN char *psycopg_escape_identifier(connectionObject *conn, - const char *str, size_t len); + const char *str, Py_ssize_t len); HIDDEN int psycopg_strdup(char **to, const char *from, Py_ssize_t len); HIDDEN int psycopg_is_text_file(PyObject *f); diff --git a/psycopg/psycopgmodule.c b/psycopg/psycopgmodule.c index bf7d908a..c4d1517a 100644 --- a/psycopg/psycopgmodule.c +++ b/psycopg/psycopgmodule.c @@ -165,7 +165,6 @@ psyco_quote_ident(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *ident = NULL, *obj = NULL, *result = NULL; connectionObject *conn; - const char *str; char *quoted = NULL; static char *kwlist[] = {"ident", "scope", NULL}; @@ -188,12 +187,9 @@ psyco_quote_ident(PyObject *self, PyObject *args, PyObject *kwargs) Py_INCREF(ident); /* for ensure_bytes */ if (!(ident = psycopg_ensure_bytes(ident))) { goto exit; } - str = Bytes_AS_STRING(ident); + if (!(quoted = psycopg_escape_identifier(conn, + Bytes_AS_STRING(ident), Bytes_GET_SIZE(ident)))) { goto exit; } - quoted = psycopg_escape_identifier(conn, str, strlen(str)); - if (!quoted) { - goto exit; - } result = conn_text_from_chars(conn, quoted); exit: diff --git a/psycopg/utils.c b/psycopg/utils.c index bc6f7bec..85ca9d6c 100644 --- a/psycopg/utils.c +++ b/psycopg/utils.c @@ -40,6 +40,8 @@ * and set an exception. The returned string includes quotes and leading E if * needed. * + * `len` is optional: if < 0 it will be calculated. 
+ * * If tolen is set, it will contain the length of the escaped string, * including quotes. */ @@ -50,7 +52,7 @@ psycopg_escape_string(connectionObject *conn, const char *from, Py_ssize_t len, Py_ssize_t ql; int eq = (conn && (conn->equote)) ? 1 : 0; - if (len == 0) { + if (len < 0) { len = strlen(from); } else if (strchr(from, '\0') != from + len) { PyErr_Format(PyExc_ValueError, "A string literal cannot contain NUL (0x00) characters."); @@ -92,13 +94,13 @@ psycopg_escape_string(connectionObject *conn, const char *from, Py_ssize_t len, /* Escape a string for inclusion in a query as identifier. * - * 'len' is optional: if 0 the length is calculated. + * 'len' is optional: if < 0 it will be calculated. * * Return a string allocated by Postgres: free it using PQfreemem * In case of error set a Python exception. */ char * -psycopg_escape_identifier(connectionObject *conn, const char *str, size_t len) +psycopg_escape_identifier(connectionObject *conn, const char *str, Py_ssize_t len) { char *rv = NULL; @@ -107,7 +109,7 @@ psycopg_escape_identifier(connectionObject *conn, const char *str, size_t len) goto exit; } - if (!len) { len = strlen(str); } + if (len < 0) { len = strlen(str); } rv = PQescapeIdentifier(conn->pgconn, str, len); if (!rv) { @@ -127,7 +129,7 @@ exit: /* Duplicate a string. * * Allocate a new buffer on the Python heap containing the new string. - * 'len' is optional: if 0 the length is calculated. + * 'len' is optional: if < 0 the length is calculated. * * Store the return in 'to' and return 0 in case of success, else return -1 * and raise an exception. 
@@ -141,7 +143,7 @@ psycopg_strdup(char **to, const char *from, Py_ssize_t len) *to = NULL; return 0; } - if (!len) { len = strlen(from); } + if (len < 0) { len = strlen(from); } if (!(*to = PyMem_Malloc(len + 1))) { PyErr_NoMemory(); return -1; From 3295beb7774098659a40649d65e84f7ae9a4838e Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Mon, 26 Dec 2016 19:47:48 +0100 Subject: [PATCH 6/8] Don't look up for Python encoding Store the encode/decode functions for the right codec in the connection. The Python encoding name has been dropped of the connection to avoid the temptation to use it... --- psycopg/adapter_qstring.c | 39 +++---- psycopg/connection.h | 5 +- psycopg/connection_int.c | 210 ++++++++++++++++++++++---------------- psycopg/connection_type.c | 1 - psycopg/cursor_type.c | 6 +- psycopg/error.h | 2 +- psycopg/error_type.c | 15 +-- psycopg/lobject_type.c | 6 +- psycopg/microprotocols.c | 4 +- psycopg/pqpath.c | 37 +++++-- psycopg/psycopg.h | 2 + psycopg/typecast.c | 3 +- psycopg/typecast_basic.c | 7 +- psycopg/utils.c | 54 ++++++++++ 14 files changed, 231 insertions(+), 160 deletions(-) diff --git a/psycopg/adapter_qstring.c b/psycopg/adapter_qstring.c index febb49ac..73579c57 100644 --- a/psycopg/adapter_qstring.c +++ b/psycopg/adapter_qstring.c @@ -36,20 +36,6 @@ static const char *default_encoding = "latin1"; /* qstring_quote - do the quote process on plain and unicode strings */ -const char * -_qstring_get_encoding(qstringObject *self) -{ - /* if the wrapped object is an unicode object we can encode it to match - conn->encoding but if the encoding is not specified we don't know what - to do and we raise an exception */ - if (self->conn) { - return self->conn->pyenc; - } - else { - return self->encoding ? 
self->encoding : default_encoding; - } -} - static PyObject * qstring_quote(qstringObject *self) { @@ -59,19 +45,15 @@ qstring_quote(qstringObject *self) const char *encoding; PyObject *rv = NULL; - encoding = _qstring_get_encoding(self); - Dprintf("qstring_quote: encoding to %s", encoding); - if (PyUnicode_Check(self->wrapped)) { - if (encoding) { - str = PyUnicode_AsEncodedString(self->wrapped, encoding, NULL); - Dprintf("qstring_quote: got encoded object at %p", str); - if (str == NULL) goto exit; + if (self->conn) { + if (!(str = conn_encode(self->conn, self->wrapped))) { goto exit; } } else { - PyErr_SetString(PyExc_TypeError, - "missing encoding to encode unicode object"); - goto exit; + encoding = self->encoding ? self->encoding : default_encoding; + if(!(str = PyUnicode_AsEncodedString(self->wrapped, encoding, NULL))) { + goto exit; + } } } @@ -162,9 +144,12 @@ qstring_conform(qstringObject *self, PyObject *args) static PyObject * qstring_get_encoding(qstringObject *self) { - const char *encoding; - encoding = _qstring_get_encoding(self); - return Text_FromUTF8(encoding); + if (self->conn) { + return conn_pgenc_to_pyenc(self->conn->encoding, NULL); + } + else { + return Text_FromUTF8(self->encoding ? 
self->encoding : default_encoding); + } } static int diff --git a/psycopg/connection.h b/psycopg/connection.h index 32b34fa2..6c5a5f6d 100644 --- a/psycopg/connection.h +++ b/psycopg/connection.h @@ -83,8 +83,6 @@ struct connectionObject { char *dsn; /* data source name */ char *critical; /* critical error on this connection */ char *encoding; /* current backend encoding */ - /* TODO: drop */ - char *pyenc; /* connection encoding python name */ long int closed; /* 1 means connection has been closed; 2 that something horrible happened */ @@ -139,7 +137,10 @@ typedef struct { /* C-callable functions in connection_int.c and connection_ext.c */ HIDDEN PyObject *conn_text_from_chars(connectionObject *pgconn, const char *str); +HIDDEN PyObject *conn_encode(connectionObject *self, PyObject *b); +HIDDEN PyObject *conn_decode(connectionObject *self, const char *str, Py_ssize_t len); HIDDEN int conn_get_standard_conforming_strings(PGconn *pgconn); +HIDDEN PyObject *conn_pgenc_to_pyenc(const char *encoding, char **clean_encoding); RAISES_NEG HIDDEN int conn_get_isolation_level(connectionObject *self); HIDDEN int conn_get_protocol_version(PGconn *pgconn); HIDDEN int conn_get_server_version(PGconn *pgconn); diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index 83b706ba..38688d30 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -58,12 +58,75 @@ const IsolationLevel conn_isolevels[] = { PyObject * conn_text_from_chars(connectionObject *self, const char *str) { -#if PY_MAJOR_VERSION < 3 - return PyString_FromString(str); -#else - const char *pyenc = self ? self->pyenc : "ascii"; - return PyUnicode_Decode(str, strlen(str), pyenc, "replace"); -#endif + return psycopg_text_from_chars_safe(str, -1, self ? self->pydecoder : NULL); +} + + +/* Encode an unicode object into a bytes object in the connection encoding. 
+ * + * If no connection or encoding is available, default to utf8 + */ +PyObject * +conn_encode(connectionObject *self, PyObject *u) +{ + PyObject *t = NULL; + PyObject *rv = NULL; + + if (!(self && self->pyencoder)) { + rv = PyUnicode_AsUTF8String(u); + goto exit; + } + + if (!(t = PyObject_CallFunctionObjArgs(self->pyencoder, u, NULL))) { + goto exit; + } + + if (!(rv = PyTuple_GetItem(t, 0))) { goto exit; } + Py_INCREF(rv); + +exit: + Py_XDECREF(t); + + return rv; +} + + +/* decode a c string into a Python unicode in the connection encoding + * + * len can be < 0: in this case it will be calculated + * + * If no connection or encoding is available, default to utf8 + */ +PyObject * +conn_decode(connectionObject *self, const char *str, Py_ssize_t len) +{ + PyObject *b = NULL; + PyObject *t = NULL; + PyObject *rv = NULL; + + if (len < 0) { len = strlen(str); } + + if (self) { + if (self->cdecoder) { + return self->cdecoder(str, len, NULL); + } + else if (self->pydecoder) { + if (!(b = Bytes_FromStringAndSize(str, len))) { goto exit; } + if (!(t = PyObject_CallFunctionObjArgs(self->pydecoder, b, NULL))) { + goto exit; + } + rv = PyTuple_GetItem(t, 0); + Py_XINCREF(rv); + } + } + else { + return PyUnicode_FromStringAndSize(str, len); + } + +exit: + Py_XDECREF(t); + Py_XDECREF(b); + return rv; } /* conn_notice_callback - process notices */ @@ -321,61 +384,20 @@ exit: return rv; } -/* Convert a PostgreSQL encoding name to a Python encoding name. - * - * Set 'pyenc' to a new copy of the encoding name allocated on the Python heap. - * Return 0 in case of success, else -1 and set an exception. - * - * 'pgenc' should be already normalized (uppercase, no - or _). 
- */ -RAISES_NEG static int -conn_pgenc_to_pyenc(const char *pgenc, char **pyenc) -{ - char *tmp; - Py_ssize_t size; - PyObject *opyenc = NULL; - int rv = -1; - - /* Find the Py encoding name from the PG encoding */ - if (!(opyenc = PyDict_GetItemString(psycoEncodings, pgenc))) { - PyErr_Format(OperationalError, - "no Python encoding for PostgreSQL encoding '%s'", pgenc); - goto exit; - } - - /* Convert the encoding in a bytes string to extract the c string. */ - Py_INCREF(opyenc); - if (!(opyenc = psycopg_ensure_bytes(opyenc))) { - goto exit; - } - - if (-1 == Bytes_AsStringAndSize(opyenc, &tmp, &size)) { - goto exit; - } - - /* have our own copy of the python encoding name */ - rv = psycopg_strdup(pyenc, tmp, size); - -exit: - Py_XDECREF(opyenc); - return rv; -} - - /* set fast access functions according to the currently selected encoding */ static void conn_set_fast_codec(connectionObject *self) { - Dprintf("conn_set_fast_codec: encoding=%s", self->pyenc); + Dprintf("conn_set_fast_codec: encoding=%s", self->encoding); - if (0 == strcmp(self->pyenc, "utf_8")) { + if (0 == strcmp(self->encoding, "UTF8")) { Dprintf("conn_set_fast_codec: PyUnicode_DecodeUTF8"); self->cdecoder = PyUnicode_DecodeUTF8; return; } - if (0 == strcmp(self->pyenc, "iso8859_1")) { + if (0 == strcmp(self->encoding, "LATIN1")) { Dprintf("conn_set_fast_codec: PyUnicode_DecodeLatin1"); self->cdecoder = PyUnicode_DecodeLatin1; return; @@ -386,12 +408,45 @@ conn_set_fast_codec(connectionObject *self) } +/* Return the Python encoding from a PostgreSQL encoding. 
+ * + * Optionally return the clean version of the postgres encoding too + */ +PyObject * +conn_pgenc_to_pyenc(const char *encoding, char **clean_encoding) +{ + char *pgenc = NULL; + PyObject *rv = NULL; + + if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } + if (!(rv = PyDict_GetItemString(psycoEncodings, pgenc))) { + PyErr_Format(OperationalError, + "no Python encoding for PostgreSQL encoding '%s'", pgenc); + goto exit; + } + Py_INCREF(rv); + + if (clean_encoding) { + *clean_encoding = pgenc; + } + else { + PyMem_Free(pgenc); + } + +exit: + return rv; +} + /* Convert a Postgres encoding into Python encoding and decoding functions. + * + * Set clean_encoding to a clean version of the Postgres encoding name + * and pyenc and pydec to python codec functions. * * Return 0 on success, else -1 and set an exception. */ RAISES_NEG static int -conn_get_python_codec(const char *encoding, PyObject **pyenc, PyObject **pydec) +conn_get_python_codec(const char *encoding, + char **clean_encoding, PyObject **pyenc, PyObject **pydec) { int rv = -1; char *pgenc = NULL; @@ -399,15 +454,7 @@ conn_get_python_codec(const char *encoding, PyObject **pyenc, PyObject **pydec) PyObject *m = NULL, *f = NULL, *codec = NULL; PyObject *enc_tmp = NULL, *dec_tmp = NULL; - if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } - - /* Find the Py encoding name from the PG encoding */ - if (!(encname = PyDict_GetItemString(psycoEncodings, pgenc))) { - PyErr_Format(OperationalError, - "no Python encoding for PostgreSQL encoding '%s'", pgenc); - goto exit; - } - Py_INCREF(encname); + if (!(encname = conn_pgenc_to_pyenc(encoding, &pgenc))) { goto exit; } /* Look up the python codec */ if (!(m = PyImport_ImportModule("codecs"))) { goto exit; } @@ -419,6 +466,7 @@ conn_get_python_codec(const char *encoding, PyObject **pyenc, PyObject **pydec) /* success */ *pyenc = enc_tmp; enc_tmp = NULL; *pydec = dec_tmp; dec_tmp = NULL; + *clean_encoding = pgenc; pgenc = NULL; rv = 0; exit: @@ 
-440,20 +488,17 @@ exit: * Return 0 on success, else -1 and set an exception. */ RAISES_NEG static int -conn_set_encoding(connectionObject *self, const char *encoding) +conn_store_encoding(connectionObject *self, const char *encoding) { int rv = -1; - char *pgenc = NULL, *pyenc = NULL; + char *pgenc = NULL; PyObject *enc_tmp = NULL, *dec_tmp = NULL; - if (0 > clear_encoding_name(encoding, &pgenc)) { goto exit; } /* TODO: drop */ + if (0 > conn_get_python_codec(encoding, &pgenc, &enc_tmp, &dec_tmp)) { + goto exit; + } - /* Look for this encoding in Python codecs. */ - if (0 > conn_pgenc_to_pyenc(pgenc, &pyenc)) { goto exit; } /* TODO: drop */ - - if (0 > conn_get_python_codec(encoding, &enc_tmp, &dec_tmp)) { goto exit; } - - /* Good, success: store the encoding/pyenc in the connection. */ + /* Good, success: store the encoding/codec in the connection. */ { char *tmp = self->encoding; self->encoding = pgenc; @@ -461,14 +506,6 @@ conn_set_encoding(connectionObject *self, const char *encoding) pgenc = NULL; } - { - /* TODO: drop */ - char *tmp = self->pyenc; - self->pyenc = pyenc; - PyMem_Free(tmp); - pyenc = NULL; - } - Py_CLEAR(self->pyencoder); self->pyencoder = enc_tmp; enc_tmp = NULL; @@ -485,7 +522,6 @@ exit: Py_XDECREF(enc_tmp); Py_XDECREF(dec_tmp); PyMem_Free(pgenc); - PyMem_Free(pyenc); return rv; } @@ -508,7 +544,7 @@ conn_read_encoding(connectionObject *self, PGconn *pgconn) goto exit; } - if (0 > conn_set_encoding(self, encoding)) { + if (0 > conn_store_encoding(self, encoding)) { goto exit; } @@ -1338,16 +1374,14 @@ conn_set_client_encoding(connectionObject *self, const char *pgenc) PGresult *pgres = NULL; char *error = NULL; int res = -1; - char *pyenc = NULL; char *clean_enc = NULL; - /* If the current encoding is equal to the requested one we don't - issue any query to the backend */ - if (strcmp(self->encoding, pgenc) == 0) return 0; - /* We must know what python encoding this encoding is. 
*/ if (0 > clear_encoding_name(pgenc, &clean_enc)) { goto exit; } - if (0 > conn_pgenc_to_pyenc(clean_enc, &pyenc)) { goto exit; } + + /* If the current encoding is equal to the requested one we don't + issue any query to the backend */ + if (strcmp(self->encoding, clean_enc) == 0) return 0; Py_BEGIN_ALLOW_THREADS; pthread_mutex_lock(&self->lock); @@ -1372,14 +1406,12 @@ endlock: goto exit; } - res = conn_set_encoding(self, pgenc); + res = conn_store_encoding(self, pgenc); - Dprintf("conn_set_client_encoding: set encoding to %s (Python: %s)", - self->encoding, self->pyenc); + Dprintf("conn_set_client_encoding: encoding set to %s", self->encoding); exit: PyMem_Free(clean_enc); - PyMem_Free(pyenc); return res; } diff --git a/psycopg/connection_type.c b/psycopg/connection_type.c index ba4e4335..7401bc14 100644 --- a/psycopg/connection_type.c +++ b/psycopg/connection_type.c @@ -1167,7 +1167,6 @@ connection_dealloc(PyObject* obj) PyMem_Free(self->dsn); PyMem_Free(self->encoding); - PyMem_Free(self->pyenc); if (self->critical) free(self->critical); if (self->cancel) PQfreeCancel(self->cancel); diff --git a/psycopg/cursor_type.c b/psycopg/cursor_type.c index c580daa2..a7303c68 100644 --- a/psycopg/cursor_type.c +++ b/psycopg/cursor_type.c @@ -286,11 +286,7 @@ static PyObject *_psyco_curs_validate_sql_basic( Py_INCREF(sql); } else if (PyUnicode_Check(sql)) { - char *enc = self->conn->pyenc; - sql = PyUnicode_AsEncodedString(sql, enc, NULL); - /* if there was an error during the encoding from unicode to the - target encoding, we just let the exception propagate */ - if (sql == NULL) { goto fail; } + if (!(sql = conn_encode(self->conn, sql))) { goto fail; } } else { /* the is not unicode or string, raise an error */ diff --git a/psycopg/error.h b/psycopg/error.h index 8bc4df5e..275a7ce7 100644 --- a/psycopg/error.h +++ b/psycopg/error.h @@ -34,7 +34,7 @@ typedef struct { PyObject *pgerror; PyObject *pgcode; cursorObject *cursor; - char *pyenc; + PyObject *pydecoder; PGresult 
*pgres; } errorObject; diff --git a/psycopg/error_type.c b/psycopg/error_type.c index 40b71aa6..4ab21915 100644 --- a/psycopg/error_type.c +++ b/psycopg/error_type.c @@ -34,17 +34,7 @@ PyObject * error_text_from_chars(errorObject *self, const char *str) { - if (str == NULL) { - Py_INCREF(Py_None); - return (Py_None); - } - -#if PY_MAJOR_VERSION < 3 - return PyString_FromString(str); -#else - return PyUnicode_Decode(str, strlen(str), - self->pyenc ? self->pyenc : "ascii", "replace"); -#endif + return psycopg_text_from_chars_safe(str, -1, self->pydecoder); } @@ -93,6 +83,7 @@ error_traverse(errorObject *self, visitproc visit, void *arg) Py_VISIT(self->pgerror); Py_VISIT(self->pgcode); Py_VISIT(self->cursor); + Py_VISIT(self->pydecoder); return ((PyTypeObject *)PyExc_StandardError)->tp_traverse( (PyObject *)self, visit, arg); @@ -104,6 +95,7 @@ error_clear(errorObject *self) Py_CLEAR(self->pgerror); Py_CLEAR(self->pgcode); Py_CLEAR(self->cursor); + Py_CLEAR(self->pydecoder); return ((PyTypeObject *)PyExc_StandardError)->tp_clear((PyObject *)self); } @@ -113,7 +105,6 @@ error_dealloc(errorObject *self) { PyObject_GC_UnTrack((PyObject *)self); error_clear(self); - PyMem_Free(self->pyenc); CLEARPGRES(self->pgres); Py_TYPE(self)->tp_free((PyObject *)self); diff --git a/psycopg/lobject_type.c b/psycopg/lobject_type.c index 61c92324..54f3a4be 100644 --- a/psycopg/lobject_type.c +++ b/psycopg/lobject_type.c @@ -86,9 +86,7 @@ psyco_lobj_write(lobjectObject *self, PyObject *args) data = obj; } else if (PyUnicode_Check(obj)) { - if (!(data = PyUnicode_AsEncodedString(obj, self->conn->pyenc, NULL))) { - goto exit; - } + if (!(data = conn_encode(self->conn, obj))) { goto exit; } } else { PyErr_Format(PyExc_TypeError, @@ -150,7 +148,7 @@ psyco_lobj_read(lobjectObject *self, PyObject *args) if (self->mode & LOBJECT_BINARY) { res = Bytes_FromStringAndSize(buffer, size); } else { - res = PyUnicode_Decode(buffer, size, self->conn->pyenc, NULL); + res = conn_decode(self->conn, buffer, 
size); } PyMem_Free(buffer); diff --git a/psycopg/microprotocols.c b/psycopg/microprotocols.c index 7bd33745..3ddcc485 100644 --- a/psycopg/microprotocols.c +++ b/psycopg/microprotocols.c @@ -251,9 +251,7 @@ microprotocol_getquoted(PyObject *obj, connectionObject *conn) /* Convert to bytes. */ if (res && PyUnicode_CheckExact(res)) { PyObject *b; - const char *pyenc; - pyenc = (conn && conn->pyenc) ? conn->pyenc : "utf8"; - b = PyUnicode_AsEncodedString(res, pyenc, NULL); + b = conn_encode(conn, res); Py_DECREF(res); res = b; } diff --git a/psycopg/pqpath.c b/psycopg/pqpath.c index c8d9c46b..328a2b26 100644 --- a/psycopg/pqpath.c +++ b/psycopg/pqpath.c @@ -167,6 +167,7 @@ pq_raise(connectionObject *conn, cursorObject *curs, PGresult **pgres) const char *err2 = NULL; const char *code = NULL; PyObject *pyerr = NULL; + PyObject *pgerror = NULL, *pgcode = NULL; if (conn == NULL) { PyErr_SetString(DatabaseError, @@ -221,19 +222,37 @@ pq_raise(connectionObject *conn, cursorObject *curs, PGresult **pgres) err2 = strip_severity(err); Dprintf("pq_raise: err2=%s", err2); + /* decode now the details of the error, because after psyco_set_error + * decoding will fail. + */ + if (!(pgerror = conn_text_from_chars(conn, err))) { + /* we can't really handle an exception while handling this error + * so just print it. 
*/ + PyErr_Print(); + PyErr_Clear(); + } + + if (!(pgcode = conn_text_from_chars(conn, code))) { + PyErr_Print(); + PyErr_Clear(); + } + pyerr = psyco_set_error(exc, curs, err2); if (pyerr && PyObject_TypeCheck(pyerr, &errorType)) { errorObject *perr = (errorObject *)pyerr; - PyMem_Free(perr->pyenc); - psycopg_strdup(&perr->pyenc, conn->pyenc, -1); + Py_CLEAR(perr->pydecoder); + Py_XINCREF(conn->pydecoder); + perr->pydecoder = conn->pydecoder; Py_CLEAR(perr->pgerror); - perr->pgerror = error_text_from_chars(perr, err); + perr->pgerror = pgerror; + pgerror = NULL; Py_CLEAR(perr->pgcode); - perr->pgcode = error_text_from_chars(perr, code); + perr->pgcode = pgcode; + pgcode = NULL; CLEARPGRES(perr->pgres); if (pgres && *pgres) { @@ -241,6 +260,9 @@ pq_raise(connectionObject *conn, cursorObject *curs, PGresult **pgres) *pgres = NULL; } } + + Py_XDECREF(pgerror); + Py_XDECREF(pgcode); } /* pq_set_critical, pq_resolve_critical - manage critical errors @@ -1332,8 +1354,7 @@ _pq_copy_in_v3(cursorObject *curs) /* a file may return unicode if implements io.TextIOBase */ if (PyUnicode_Check(o)) { PyObject *tmp; - Dprintf("_pq_copy_in_v3: encoding in %s", curs->conn->pyenc); - if (!(tmp = PyUnicode_AsEncodedString(o, curs->conn->pyenc, NULL))) { + if (!(tmp = conn_encode(curs->conn, o))) { Dprintf("_pq_copy_in_v3: encoding() failed"); error = 1; break; @@ -1488,7 +1509,7 @@ _pq_copy_out_v3(cursorObject *curs) if (len > 0 && buffer) { if (is_text) { - obj = PyUnicode_Decode(buffer, len, curs->conn->pyenc, NULL); + obj = conn_decode(curs->conn, buffer, len); } else { obj = Bytes_FromStringAndSize(buffer, len); } @@ -1638,7 +1659,7 @@ retry: Dprintf("pq_read_replication_message: >>%.*s<<", data_size, buffer + hdr); if (repl->decode) { - str = PyUnicode_Decode(buffer + hdr, data_size, conn->pyenc, NULL); + str = conn_decode(conn, buffer + hdr, data_size); } else { str = Bytes_FromStringAndSize(buffer + hdr, data_size); } diff --git a/psycopg/psycopg.h b/psycopg/psycopg.h index 
fc5b533e..13673540 100644 --- a/psycopg/psycopg.h +++ b/psycopg/psycopg.h @@ -132,6 +132,8 @@ HIDDEN char *psycopg_escape_identifier(connectionObject *conn, const char *str, Py_ssize_t len); HIDDEN int psycopg_strdup(char **to, const char *from, Py_ssize_t len); HIDDEN int psycopg_is_text_file(PyObject *f); +HIDDEN PyObject *psycopg_text_from_chars_safe( + const char *str, Py_ssize_t len, PyObject *decoder); STEALS(1) HIDDEN PyObject * psycopg_ensure_bytes(PyObject *obj); diff --git a/psycopg/typecast.c b/psycopg/typecast.c index d83c390b..214d3f04 100644 --- a/psycopg/typecast.c +++ b/psycopg/typecast.c @@ -671,8 +671,7 @@ typecast_cast(PyObject *obj, const char *str, Py_ssize_t len, PyObject *curs) #if PY_MAJOR_VERSION < 3 s = PyString_FromStringAndSize(str, len); #else - s = PyUnicode_Decode(str, len, - ((cursorObject *)curs)->conn->pyenc, NULL); + s = conn_decode(((cursorObject *)curs)->conn, str, len); #endif } else { diff --git a/psycopg/typecast_basic.c b/psycopg/typecast_basic.c index d55820c2..db6c5a93 100644 --- a/psycopg/typecast_basic.c +++ b/psycopg/typecast_basic.c @@ -98,12 +98,7 @@ typecast_UNICODE_cast(const char *s, Py_ssize_t len, PyObject *curs) if (s == NULL) { Py_RETURN_NONE; } conn = ((cursorObject*)curs)->conn; - if (conn->cdecoder) { - return conn->cdecoder(s, len, NULL); - } - else { - return PyUnicode_Decode(s, len, conn->pyenc, NULL); - } + return conn_decode(conn, s, len); } /** BOOLEAN - cast boolean value into right python object **/ diff --git a/psycopg/utils.c b/psycopg/utils.c index 85ca9d6c..7f6b6e6e 100644 --- a/psycopg/utils.c +++ b/psycopg/utils.c @@ -278,3 +278,57 @@ exit: return res; } + + +/* Convert a C string into Python Text using a specified codec. + * + * The codec is the python function codec.getdecoder(enc). It is only used on + * Python 3 to return unicode: in Py2 the function returns a string. + * + * len is optional: use -1 to have it calculated by the function. 
+ */ +PyObject * +psycopg_text_from_chars_safe(const char *str, Py_ssize_t len, PyObject *decoder) +{ +#if PY_MAJOR_VERSION < 3 + + if (!str) { Py_RETURN_NONE; } + + if (len < 0) { len = strlen(str); } + + return PyString_FromStringAndSize(str, len); + +#else + + static PyObject *replace = NULL; + PyObject *rv = NULL; + PyObject *b = NULL; + PyObject *t = NULL; + + if (!str) { Py_RETURN_NONE; } + + if (len < 0) { len = strlen(str); } + + if (decoder) { + if (!replace) { + if (!(replace = PyUnicode_FromString("replace"))) { goto exit; } + } + if (!(b = PyBytes_FromStringAndSize(str, len))) { goto exit; } + if (!(t = PyObject_CallFunctionObjArgs(decoder, b, replace, NULL))) { + goto exit; + } + + if (!(rv = PyTuple_GetItem(t, 0))) { goto exit; } + Py_INCREF(rv); + } + else { + rv = PyUnicode_DecodeASCII(str, len, "replace"); + } + +exit: + Py_XDECREF(t); + Py_XDECREF(b); + return rv; + +#endif +} From cb5293be1f12034d9108845e4f968ffbbf08f8dd Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Thu, 29 Dec 2016 21:13:19 +0100 Subject: [PATCH 7/8] Use the proper API functions to look up codec functions --- psycopg/connection.h | 4 +++- psycopg/connection_int.c | 15 +++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/psycopg/connection.h b/psycopg/connection.h index 6c5a5f6d..2e2d51de 100644 --- a/psycopg/connection.h +++ b/psycopg/connection.h @@ -122,9 +122,11 @@ struct connectionObject { PyObject *cursor_factory; /* default cursor factory from cursor() */ - /* Pointer to a decoding function, e.g. PyUnicode_DecodeUTF8 */ + /* Optional pointer to a decoding C function, e.g. PyUnicode_DecodeUTF8 */ PyObject *(*cdecoder)(const char *, Py_ssize_t, const char *); + /* Pointers to python encoding/decoding functions, e.g. 
+ * codecs.getdecoder('utf8') */ PyObject *pyencoder; /* python codec encoding function */ PyObject *pydecoder; /* python codec decoding function */ }; diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index 38688d30..f92a658e 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -451,17 +451,15 @@ conn_get_python_codec(const char *encoding, int rv = -1; char *pgenc = NULL; PyObject *encname = NULL; - PyObject *m = NULL, *f = NULL, *codec = NULL; PyObject *enc_tmp = NULL, *dec_tmp = NULL; + /* get the Python name of the encoding as a C string */ if (!(encname = conn_pgenc_to_pyenc(encoding, &pgenc))) { goto exit; } + if (!(encname = psycopg_ensure_bytes(encname))) { goto exit; } - /* Look up the python codec */ - if (!(m = PyImport_ImportModule("codecs"))) { goto exit; } - if (!(f = PyObject_GetAttrString(m, "lookup"))) { goto exit; } - if (!(codec = PyObject_CallFunctionObjArgs(f, encname, NULL))) { goto exit; } - if (!(enc_tmp = PyObject_GetAttrString(codec, "encode"))) { goto exit; } - if (!(dec_tmp = PyObject_GetAttrString(codec, "decode"))) { goto exit; } + /* Look up the codec functions */ + if (!(enc_tmp = PyCodec_Encoder(Bytes_AS_STRING(encname)))) { goto exit; } + if (!(dec_tmp = PyCodec_Decoder(Bytes_AS_STRING(encname)))) { goto exit; } /* success */ *pyenc = enc_tmp; enc_tmp = NULL; @@ -472,9 +470,6 @@ conn_get_python_codec(const char *encoding, exit: Py_XDECREF(enc_tmp); Py_XDECREF(dec_tmp); - Py_XDECREF(codec); - Py_XDECREF(f); - Py_XDECREF(m); Py_XDECREF(encname); PyMem_Free(pgenc); From f3e47a72ed10102efe1465bc3d62b1d93ea04f00 Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Thu, 29 Dec 2016 20:47:24 +0100 Subject: [PATCH 8/8] Brag about encoding/decoding speedup --- NEWS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS b/NEWS index 6ffa66a9..ecde78d3 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,8 @@ New features: - Added `~psycopg2.extensions.quote_ident()` function (:ticket:`#359`). 
- Added `~connection.get_dsn_parameters()` connection method (:ticket:`#364`). - `~cursor.callproc()` now accepts a dictionary of parameters (:ticket:`#381`). +- Using Python C API decoding functions and codecs caching for faster + unicode encoding/decoding (:ticket:`#473`). Other changes: