Skip to content

Commit

Permalink
gh-129173: Use _PyUnicodeError_GetParams in `PyCodec_SurrogatePassE…
Browse files Browse the repository at this point in the history
…rrors` (GH-129134)
  • Loading branch information
picnixz authored Feb 14, 2025
1 parent 303043f commit 1775091
Showing 1 changed file with 162 additions and 120 deletions.
282 changes: 162 additions & 120 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
#define ENC_UTF32LE 4

static int
get_standard_encoding(const char *encoding, int *bytelength)
get_standard_encoding_impl(const char *encoding, int *bytelength)
{
if (Py_TOLOWER(encoding[0]) == 'u' &&
Py_TOLOWER(encoding[1]) == 't' &&
Expand Down Expand Up @@ -1153,172 +1153,212 @@ get_standard_encoding(const char *encoding, int *bytelength)
return ENC_UNKNOWN;
}

/* This handler is declared static until someone demonstrates
a need to call it directly. */

static int
get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
{
const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
if (encoding_cstr == NULL) {
return -1;
}
*code = get_standard_encoding_impl(encoding_cstr, bytelength);
return 0;
}


// --- handler: 'surrogatepass' -----------------------------------------------

static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
PyObject *encode;
const char *encoding;
int code;
int bytelength;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
unsigned char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}
PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, false) < 0)
{
return NULL;
}

if (end - start > PY_SSIZE_T_MAX / bytelength)
end = start + PY_SSIZE_T_MAX / bytelength;
res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
if (slen > PY_SSIZE_T_MAX / bytelength) {
end = start + PY_SSIZE_T_MAX / bytelength;
end = Py_MIN(end, objlen);
slen = Py_MAX(0, end - start);
}

PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
if (res == NULL) {
Py_DECREF(obj);
return NULL;
}

unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
for (Py_ssize_t i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
Py_DECREF(obj);
Py_DECREF(res);
goto bail;
}
outp = (unsigned char*)PyBytes_AsString(res);
for (i = start; i < end; i++) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
Py_DECREF(object);
return NULL;
}
switch (code) {
case ENC_UTF8:
switch (code) {
case ENC_UTF8: {
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
case ENC_UTF16LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF16LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
case ENC_UTF32LE:
*outp++ = (unsigned char) ch;
}
case ENC_UTF32LE: {
*outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char) ch;
*outp++ = (unsigned char)ch;
break;
}
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
const unsigned char *p;
Py_UCS4 ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
p = (const unsigned char*)PyBytes_AS_STRING(object);
if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
Py_DECREF(object);
return NULL;
}
if (!(encoding = PyUnicode_AsUTF8(encode))) {
Py_DECREF(object);
Py_DECREF(encode);
return NULL;
}
code = get_standard_encoding(encoding, &bytelength);
Py_DECREF(encode);
if (code == ENC_UNKNOWN) {
/* Not supported, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(object);
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
if (PyBytes_GET_SIZE(object) - start >= bytelength) {
switch (code) {
case ENC_UTF8:
Py_DECREF(obj);
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
return restuple;

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


static PyObject *
_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
{
PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
if (encoding == NULL) {
return NULL;
}
int code, bytelength;
int rc = get_standard_encoding(encoding, &code, &bytelength);
Py_DECREF(encoding);
if (rc < 0) {
return NULL;
}
if (code == ENC_UNKNOWN) {
goto bail;
}

PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, true) < 0)
{
return NULL;
}

/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
Py_UCS4 ch = 0;
const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
p += start;

if (objlen - start >= bytelength) {
switch (code) {
case ENC_UTF8: {
if ((p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
(p[2] & 0xc0) == 0x80) {
(p[2] & 0xc0) == 0x80)
{
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
ch = ((p[0] & 0x0f) << 12) +
((p[1] & 0x3f) << 6) +
(p[2] & 0x3f);
}
break;
case ENC_UTF16LE:
}
case ENC_UTF16LE: {
ch = p[1] << 8 | p[0];
break;
case ENC_UTF16BE:
}
case ENC_UTF16BE: {
ch = p[0] << 8 | p[1];
break;
case ENC_UTF32LE:
}
case ENC_UTF32LE: {
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
case ENC_UTF32BE:
}
case ENC_UTF32BE: {
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
}
}
Py_DECREF(obj);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
goto bail;
}

Py_DECREF(object);
if (!Py_UNICODE_IS_SURROGATE(ch)) {
/* it's not a surrogate - fail */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
res = PyUnicode_FromOrdinal(ch);
if (res == NULL)
return NULL;
return Py_BuildValue("(Nn)", res, start + bytelength);
PyObject *res = PyUnicode_FromOrdinal(ch);
if (res == NULL) {
return NULL;
}
return Py_BuildValue("(Nn)", res, start + bytelength);

bail:
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}


/* This handler is declared static until someone demonstrates
a need to call it directly. */
static PyObject *
PyCodec_SurrogatePassErrors(PyObject *exc)
{
if (_PyIsUnicodeEncodeError(exc)) {
return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
}
else if (_PyIsUnicodeDecodeError(exc)) {
return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
}
else {
wrong_exception_type(exc);
return NULL;
}
}


static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
Expand Down Expand Up @@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
}


static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
static inline PyObject *
surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);
}


static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogateEscapeErrors(exc);
Expand Down

0 comments on commit 1775091

Please sign in to comment.