Solid tokenization code.

This commit is contained in:
Federico Di Gregorio 2005-03-23 17:17:48 +00:00
parent cb9cec57c0
commit 75e7273d85
3 changed files with 81 additions and 63 deletions

View File

@ -1,3 +1,8 @@
2005-03-24 Federico Di Gregorio <fog@debian.org>
* psycopg/typecast_array.c (typecast_array_tokenize): much better
tokenization code.
2005-03-23 Federico Di Gregorio <fog@debian.org> 2005-03-23 Federico Di Gregorio <fog@debian.org>
* psycopg/typecast_basic.c: all the basic casters now respect the * psycopg/typecast_basic.c: all the basic casters now respect the

View File

@ -35,81 +35,98 @@ static int
typecast_array_tokenize(unsigned char *str, int strlength, typecast_array_tokenize(unsigned char *str, int strlength,
int *pos, unsigned char** token, int *length) int *pos, unsigned char** token, int *length)
{ {
int i, l, res = ASCAN_TOKEN; /* FORTRAN glory */
int qs = 0; /* 2 = in quotes, 1 = quotes closed */ int i, j, q, b, l, res;
/* first we check for quotes, used when the content of the item contains Dprintf("typecast_array_tokenize: '%s', %d/%d",
special or quoted characters */ &str[*pos], *pos, strlength);
if (str[*pos] == '"') { /* we always get called with pos pointing at the start of a token, so a
qs = 2; fast check is enough for ASCAN_EOF, ASCAN_BEGIN and ASCAN_END */
if (*pos == strlength) {
return ASCAN_EOF;
}
else if (str[*pos] == '{') {
*pos += 1; *pos += 1;
return ASCAN_BEGIN;
}
else if (str[*pos] == '}') {
*pos += 1;
if (str[*pos] == ',')
*pos += 1;
return ASCAN_END;
} }
Dprintf("typecast_array_tokenize: '%s'; %d/%d", /* now we start looking for the first unquoted ',' or '}', the only two
&str[*pos], *pos, strlength); tokens that can limit an array element */
q = 0; /* if q is odd we're inside quotes */
b = 0; /* if b is 1 we just encountered a backslash */
res = ASCAN_TOKEN;
for (i = *pos ; i < strlength ; i++) { for (i = *pos ; i < strlength ; i++) {
switch (str[i]) { switch (str[i]) {
case '{':
*pos = i+1;
return ASCAN_BEGIN;
case '}':
/* we tokenize the last item in the array and then return it to
the user together with the closing bracket marker */
res = ASCAN_END;
goto tokenize;
case '"': case '"':
/* this will close the quoting only if the previous character was if (b == 0)
NOT a backslash */ q += 1;
if (qs == 2 && str[i-1] != '\\') qs = 1; else
continue; b = 0;
break;
case '\\': case '\\':
/* something has been quoted, sigh, we'll need a copy buffer */
res = ASCAN_QUOTED; res = ASCAN_QUOTED;
continue; if (b == 0)
b = 1;
case ',':
/* if we're inside quotes we use the comma as a normal char */
if (qs == 2)
continue;
else else
goto tokenize; /* we're backslashing a backslash */
} b = 0;
} break;
res = ASCAN_EOF; case '}':
case ',':
if (b == 0 && ((q&1) == 0))
goto tokenize;
break;
default:
/* reset the backslash counter */
b = 0;
break;
}
}
tokenize: tokenize:
l = i - *pos - qs; /* remove initial quoting character and calculate raw length */
l = i - *pos;
if (str[*pos] == '"') {
*pos += 1;
l -= 2;
}
/* if res is ASCAN_QUOTED we need to copy the string to a newly allocated
buffer and return it */
if (res == ASCAN_QUOTED) { if (res == ASCAN_QUOTED) {
unsigned char *buffer = PyMem_Malloc(l+1); unsigned char *buffer = PyMem_Malloc(l+1);
if (buffer == NULL) return ASCAN_ERROR; if (buffer == NULL) return ASCAN_ERROR;
*token = buffer; *token = buffer;
for (i = *pos; i < l+*pos; i++) { for (j = *pos; j < *pos+l; j++) {
if (str[i] != '\\') if (str[j] != '\\'
*(buffer++) = str[i]; || (j > *pos && str[j-1] == '\\'))
*(buffer++) = str[j];
} }
*buffer = '\0'; *buffer = '\0';
*length = (int)buffer - (int)*token; *length = (int)buffer - (int)*token;
*pos = i+2;
} }
else { else {
*token = &str[*pos]; *token = &str[*pos];
*length = l; *length = l;
*pos = i+1;
if (res == ASCAN_END && str[*pos] == ',')
*pos += 1; /* skip both the bracket and the comma */
} }
*pos = i;
/* skip the comma and set position to the start of next token */
if (str[i] == ',') *pos += 1;
return res; return res;
} }
@ -117,19 +134,18 @@ static int
typecast_array_scan(unsigned char *str, int strlength, typecast_array_scan(unsigned char *str, int strlength,
PyObject *curs, PyObject *base, PyObject *array) PyObject *curs, PyObject *base, PyObject *array)
{ {
int state, length, bracket = 0, pos = 0; int state, length, pos = 0;
unsigned char *token; unsigned char *token;
PyObject *stack[MAX_DIMENSIONS]; PyObject *stack[MAX_DIMENSIONS];
int stack_index = 0; int stack_index = 0;
while (1) { while (1) {
token = NULL;
state = typecast_array_tokenize(str, strlength, &pos, &token, &length); state = typecast_array_tokenize(str, strlength, &pos, &token, &length);
if (state == ASCAN_TOKEN Dprintf("typecast_array_scan: state = %d, length = %d, token = '%s'",
|| state == ASCAN_QUOTED state, length, token);
|| (state == ASCAN_EOF && bracket == 0) if (state == ASCAN_TOKEN || state == ASCAN_QUOTED) {
|| (state == ASCAN_END && bracket == 0)) {
PyObject *obj = typecast_cast(base, token, length, curs); PyObject *obj = typecast_cast(base, token, length, curs);
/* before anything else we free the memory */ /* before anything else we free the memory */
@ -139,6 +155,7 @@ typecast_array_scan(unsigned char *str, int strlength,
PyList_Append(array, obj); PyList_Append(array, obj);
Py_DECREF(obj); Py_DECREF(obj);
} }
else if (state == ASCAN_BEGIN) { else if (state == ASCAN_BEGIN) {
PyObject *sub = PyList_New(0); PyObject *sub = PyList_New(0);
if (sub == NULL) return 0; if (sub == NULL) return 0;
@ -152,23 +169,19 @@ typecast_array_scan(unsigned char *str, int strlength,
stack[stack_index++] = array; stack[stack_index++] = array;
array = sub; array = sub;
} }
else if (state == ASCAN_ERROR) { else if (state == ASCAN_ERROR) {
return 0; return 0;
} }
/* reset the closing bracket marker just before checking for ASCAN_END: else if (state == ASCAN_END) {
this is to make sure we don't mistake two closing brackets for an
empty item */
bracket = 0;
if (state == ASCAN_END) {
if (--stack_index < 0) if (--stack_index < 0)
return 0; return 0;
array = stack[stack_index]; array = stack[stack_index];
bracket = 1;
} }
if (state == ASCAN_EOF) break; else if (state == ASCAN_EOF)
break;
} }
return 1; return 1;

View File

@ -16,6 +16,6 @@ print d, '->', d[0], d[1], d[2]
curs.execute("SELECT ARRAY[ARRAY[1,2],ARRAY[3,4]] AS foo") curs.execute("SELECT ARRAY[ARRAY[1,2],ARRAY[3,4]] AS foo")
print curs.fetchone()[0] print curs.fetchone()[0]
curs.execute("SELECT ARRAY['20:00:01'::time] AS foo") curs.execute("SELECT ARRAY[ARRAY[now(), now()], ARRAY[now(), now()]] AS foo")
print curs.description print curs.description
print curs.fetchone()[0] print curs.fetchone()[0]