From 88803695ac3158f76d29ff1cdd3621740e6af53f Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Fri, 8 Apr 2011 11:27:45 +0100 Subject: [PATCH 1/3] Normalize the encoding name at connection The encoding can be set by PGCLIENTENCODING, which may be an alternative spelling. Bug reported by Peter Eisentraut. At this point the idea of considering one of the random spellings such as EUC_CN as somewhat "blessed" is debunked. So just store the cleaned-up version of the encoding in the mapping table. Note that the cleaned-up version was needed by the unicode adapter: this requirement has been surpassed as the connection now contains a copy of the Python codec name set whenever the client encoding is set. --- psycopg/connection_int.c | 51 ++++++++++++++++++++++++++++++++------- psycopg/connection_type.c | 22 ++--------------- tests/test_connection.py | 14 +++++++++++ 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index fa714f66..6006b15c 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -236,6 +236,39 @@ conn_get_standard_conforming_strings(PGconn *pgconn) return equote; } + +/* Remove irrelevant chars from encoding name and turn it uppercase. + * + * Return a buffer allocated on Python heap, + * NULL and set an exception on error. + */ +static char * +clean_encoding_name(const char *enc) +{ + const char *i = enc; + char *rv, *j; + + /* convert to upper case and remove '-' and '_' from string */ + if (!(j = rv = PyMem_Malloc(strlen(enc) + 1))) { + PyErr_NoMemory(); + return NULL; + } + + while (*i) { + if (!isalnum(*i)) { + ++i; + } + else { + *j++ = toupper(*i++); + } + } + *j = '\0'; + + Dprintf("clean_encoding_name: %s -> %s", enc, rv); + + return rv; +} + /* Convert a PostgreSQL encoding to a Python codec. * * Return a new copy of the codec name allocated on the Python heap, @@ -246,11 +279,16 @@ conn_encoding_to_codec(const char *enc) { char *tmp; Py_ssize_t size; + char *norm_enc = NULL; PyObject *pyenc = NULL; char *rv = NULL; + if (!(norm_enc = clean_encoding_name(enc))) { + goto exit; + } + /* Find the Py codec name from the PG encoding */ - if (!(pyenc = PyDict_GetItemString(psycoEncodings, enc))) { + if (!(pyenc = PyDict_GetItemString(psycoEncodings, norm_enc))) { PyErr_Format(OperationalError, "no Python codec for client encoding '%s'", enc); goto exit; @@ -270,6 +308,7 @@ conn_encoding_to_codec(const char *enc) rv = psycopg_strdup(tmp, size); exit: + PyMem_Free(norm_enc); Py_XDECREF(pyenc); return rv; } @@ -285,7 +324,7 @@ exit: static int conn_read_encoding(connectionObject *self, PGconn *pgconn) { - char *enc = NULL, *codec = NULL, *j; + char *enc = NULL, *codec = NULL; const char *tmp; int rv = -1; @@ -297,16 +336,10 @@ conn_read_encoding(connectionObject *self, PGconn *pgconn) goto exit; } - if (!(enc = PyMem_Malloc(strlen(tmp)+1))) { - PyErr_NoMemory(); + if (!(enc = psycopg_strdup(tmp, 0))) { goto exit; } - /* turn encoding in uppercase */ - j = enc; - while (*tmp) { *j++ = toupper(*tmp++); } - *j = '\0'; - /* Look for this encoding in Python codecs. */ if (!(codec = conn_encoding_to_codec(enc))) { goto exit; diff --git a/psycopg/connection_type.c b/psycopg/connection_type.c index b0c9ddcc..7ca395dc 100644 --- a/psycopg/connection_type.c +++ b/psycopg/connection_type.c @@ -423,36 +423,18 @@ static PyObject * psyco_conn_set_client_encoding(connectionObject *self, PyObject *args) { const char *enc; - char *buffer, *dest; PyObject *rv = NULL; - Py_ssize_t len; EXC_IF_CONN_CLOSED(self); EXC_IF_CONN_ASYNC(self, set_client_encoding); EXC_IF_TPC_PREPARED(self, set_client_encoding); - if (!PyArg_ParseTuple(args, "s#", &enc, &len)) return NULL; + if (!PyArg_ParseTuple(args, "s", &enc)) return NULL; - /* convert to upper case and remove '-' and '_' from string */ - if (!(dest = buffer = PyMem_Malloc(len+1))) { - return PyErr_NoMemory(); - } - - while (*enc) { - if (*enc == '_' || *enc == '-') { - ++enc; - } - else { - *dest++ = toupper(*enc++); - } - } - *dest = '\0'; - - if (conn_set_client_encoding(self, buffer) == 0) { + if (conn_set_client_encoding(self, enc) == 0) { Py_INCREF(Py_None); rv = Py_None; } - PyMem_Free(buffer); return rv; } diff --git a/tests/test_connection.py b/tests/test_connection.py index e237524b..d9da471f 100755 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -22,6 +22,7 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. +import os import time import threading from testutils import unittest, decorate_all_tests, skip_before_postgres @@ -141,6 +142,19 @@ class ConnectionTests(unittest.TestCase): cur.execute("select 'foo'::text;") self.assertEqual(cur.fetchone()[0], u'foo') + def test_connect_nonnormal_envvar(self): + # We must perform encoding normalization at connection time + self.conn.close() + oldenc = os.environ.get('PGCLIENTENCODING') + os.environ['PGCLIENTENCODING'] = 'utf-8' # malformed spelling + try: + self.conn = psycopg2.connect(dsn) + finally: + if oldenc is not None: + os.environ['PGCLIENTENCODING'] = oldenc + else: + del os.environ['PGCLIENTENCODING'] + def test_weakref(self): from weakref import ref conn = psycopg2.connect(dsn) From 19653a88ec4f85b040ca3054b14887c8105f846d Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Fri, 8 Apr 2011 13:27:11 +0100 Subject: [PATCH 2/3] Store a normalized version of the PG encoding in the connection This way looking up into extensions.encodings will not break. --- psycopg/connection_int.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/psycopg/connection_int.c b/psycopg/connection_int.c index 6006b15c..22c5bc59 100644 --- a/psycopg/connection_int.c +++ b/psycopg/connection_int.c @@ -273,22 +273,19 @@ clean_encoding_name(const char *enc) * * Return a new copy of the codec name allocated on the Python heap, * NULL with exception in case of error. + * + * 'enc' should be already normalized (uppercase, no - or _). */ static char * conn_encoding_to_codec(const char *enc) { char *tmp; Py_ssize_t size; - char *norm_enc = NULL; PyObject *pyenc = NULL; char *rv = NULL; - if (!(norm_enc = clean_encoding_name(enc))) { - goto exit; - } - /* Find the Py codec name from the PG encoding */ - if (!(pyenc = PyDict_GetItemString(psycoEncodings, norm_enc))) { + if (!(pyenc = PyDict_GetItemString(psycoEncodings, enc))) { PyErr_Format(OperationalError, "no Python codec for client encoding '%s'", enc); goto exit; @@ -308,7 +305,6 @@ conn_encoding_to_codec(const char *enc) rv = psycopg_strdup(tmp, size); exit: - PyMem_Free(norm_enc); Py_XDECREF(pyenc); return rv; } @@ -336,7 +332,7 @@ conn_read_encoding(connectionObject *self, PGconn *pgconn) goto exit; } - if (!(enc = psycopg_strdup(tmp, 0))) { + if (!(enc = clean_encoding_name(tmp))) { goto exit; } @@ -998,21 +994,23 @@ conn_set_client_encoding(connectionObject *self, const char *enc) PGresult *pgres = NULL; char *error = NULL; char query[48]; - int res = 0; - char *codec; + int res = 1; + char *codec = NULL; + char *clean_enc = NULL; /* If the current encoding is equal to the requested one we don't issue any query to the backend */ if (strcmp(self->encoding, enc) == 0) return 0; /* We must know what python codec this encoding is. */ - if (!(codec = conn_encoding_to_codec(enc))) { return -1; } + if (!(clean_enc = clean_encoding_name(enc))) { goto exit; } + if (!(codec = conn_encoding_to_codec(clean_enc))) { goto exit; } Py_BEGIN_ALLOW_THREADS; pthread_mutex_lock(&self->lock); /* set encoding, no encoding string is longer than 24 bytes */ - PyOS_snprintf(query, 47, "SET client_encoding = '%s'", enc); + PyOS_snprintf(query, 47, "SET client_encoding = '%s'", clean_enc); /* abort the current transaction, to set the encoding ouside of transactions */ @@ -1027,21 +1025,18 @@ conn_set_client_encoding(connectionObject *self, const char *enc) /* no error, we can proceeed and store the new encoding */ { char *tmp = self->encoding; - self->encoding = NULL; + self->encoding = clean_enc; PyMem_Free(tmp); - } - if (!(self->encoding = psycopg_strdup(enc, 0))) { - res = 1; /* don't call pq_complete_error below */ - goto endlock; + clean_enc = NULL; } /* Store the python codec too. */ { char *tmp = self->codec; - self->codec = NULL; + self->codec = codec; PyMem_Free(tmp); + codec = NULL; } - self->codec = codec; Dprintf("conn_set_client_encoding: set encoding to %s (codec: %s)", self->encoding, self->codec); @@ -1054,6 +1049,10 @@ endlock: if (res < 0) pq_complete_error(self, &pgres, &error); +exit: + PyMem_Free(clean_enc); + PyMem_Free(codec); + return res; } From e3605b33c13ddc4f35d30f8ea9b536c2a9b494c0 Mon Sep 17 00:00:00 2001 From: Daniele Varrazzo Date: Fri, 8 Apr 2011 14:36:49 +0100 Subject: [PATCH 3/3] Updated NEWS with the connection encoding fix --- NEWS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS b/NEWS index b952c274..7e2de335 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ What's new in psycopg 2.4.1 - Use own parser for bytea output, not requiring anymore the libpq 9.0 to parse the hex format. + - Don't fail connection if the client encoding is a non-normalized + variant. Issue reported by Peter Eisentraut. - Correctly detect an empty query sent to the backend (ticket #46). - Allow to specify --static-libpq on setup.py command line instead of just in 'setup.cfg'. Patch provided by Matthew Ryan (ticket #48).