/* Format bytes as hexadecimal */

#include "Python.h"
#include "pycore_strhex.h"        // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()

/* Scalar hexlify: convert len bytes to 2*len hex characters.
   Uses table lookup via Py_hexdigits for the conversion. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Various optimizations have been tried and benchmarked: using math
       instead of a table lookup, manually unrolling the loop, storing the
       global table pointer locally, and doing wider dst writes.  All
       produced nearly identical performance on gcc 15, and a 256-entry
       uint16_t table was a bit slower.  So we keep our old simple and
       obvious code. */
    for (Py_ssize_t i = 0; i < len; i++) {
        unsigned char c = src[i];
        *dst++ = Py_hexdigits[c >> 4];
        *dst++ = Py_hexdigits[c & 0x0f];
    }
}

/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.

   Uses __builtin_shufflevector for a portable interleave that compiles to
   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
   NEON zip1/zip2 on ARM64 [always], and vzip on ARM32 when compiler flags
   for the target microarch allow it [try -march=native if running 32-bit
   on an RPi3 or later]).

   Performance:
   - On the more common small inputs it is 1.1-3x faster.
   - On larger inputs it is up to 11x faster than the scalar code.

   More speed is possible on big data using AVX2 or AVX512, but that adds
   a lot of complication.  Who ever really hexes huge data?  The 16-64
   byte boosts align nicely with md5 through sha512 hexdigests. */
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR

/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));

/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of the
   slower psubusb+pcmpeqb sequence from unsigned comparison.  ARM NEON
   performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Splat a byte value across all 16 lanes */
static inline v16u8
v16u8_splat(unsigned char x)
{
    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

static inline v16s8
v16s8_splat(signed char x)
{
    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    const v16u8 offset = v16u8_splat('a' - '0' - 10);  /* 0x27 */
    const v16s8 nine = v16s8_splat(9);

    Py_ssize_t i = 0;
    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.  This
           generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);

        /* Convert nibbles to hex ASCII */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave the hi/lo hex digits using a portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on
           ARM64, or vzip on ARM32. */
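        /* Illustration (hypothetical lane values): with
           hi = {H0, H1, ..., H15} and lo = {L0, L1, ..., L15}, the two
           shuffles below produce {H0, L0, H1, L1, ..., H7, L7} and
           {H8, L8, H9, L9, ..., H15, L15}: the two hex digits of each
           input byte, in source order. */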
        v16u8 result0 = __builtin_shufflevector(hi, lo,
                0, 16, 1, 17, 2, 18, 3, 19,
                4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
                8, 24, 9, 25, 10, 26, 11, 27,
                12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }
    /* Scalar fallback for the remaining 0-15 bytes */
    _Py_hexlify_scalar(src + i, dst, len - i);
}

#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */

static PyObject *
_Py_strhex_impl(const char* argbuf, Py_ssize_t arglen,
                PyObject* sep, Py_ssize_t bytes_per_sep_group,
                int return_bytes)
{
    assert(arglen >= 0);

    Py_UCS1 sep_char = 0;
    if (sep) {
        Py_ssize_t seplen = PyObject_Length(sep);
        if (seplen < 0) {
            return NULL;
        }
        if (seplen != 1) {
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
            return NULL;
        }
        if (PyUnicode_Check(sep)) {
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
                return NULL;
            }
            sep_char = PyUnicode_READ_CHAR(sep, 0);
        }
        else if (PyBytes_Check(sep)) {
            sep_char = PyBytes_AS_STRING(sep)[0];
        }
        else {
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
            return NULL;
        }
        if (sep_char > 127 && !return_bytes) {
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
            return NULL;
        }
    }
    else {
        bytes_per_sep_group = 0;
    }

    size_t abs_bytes_per_sep = _Py_ABS_CAST(size_t, bytes_per_sep_group);
    Py_ssize_t resultlen = 0;
    if (bytes_per_sep_group && arglen > 0) {
        /* How many sep characters we'll be inserting. */
        resultlen = (arglen - 1) / abs_bytes_per_sep;
    }
    /* Bounds checking for our Py_ssize_t indices. */
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
        return PyErr_NoMemory();
    }
    resultlen += arglen * 2;
    if (abs_bytes_per_sep >= (size_t)arglen) {
        bytes_per_sep_group = 0;
        abs_bytes_per_sep = 0;
    }
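    /* Worked example (illustrative values): arglen == 4 with
       bytes_per_sep_group == 2 inserts (4-1)/2 == 1 separator among
       4*2 == 8 hex digits, so resultlen == 9.  That matches
       b'\xde\xad\xbe\xef'.hex(':', 2) == 'dead:beef' at the Python
       level. */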
    PyObject *retval;
    Py_UCS1 *retbuf;
    if (return_bytes) {
        /* If _PyBytes_FromSize() were public we could avoid a
           malloc+copy. */
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
        if (!retval) {
            return NULL;
        }
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
    }
    else {
        retval = PyUnicode_New(resultlen, 127);
        if (!retval) {
            return NULL;
        }
        retbuf = PyUnicode_1BYTE_DATA(retval);
    }

    /* Hexlify */
    Py_ssize_t i, j;
    unsigned char c;
    if (bytes_per_sep_group == 0) {
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
        if (arglen >= 16) {
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
        }
        else
#endif
        {
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
        }
    }
    else {
        /* The number of complete chunk+sep periods */
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
        Py_ssize_t chunk;
        size_t k;
        if (bytes_per_sep_group < 0) {
            i = j = 0;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i++];
                    retbuf[j++] = Py_hexdigits[c >> 4];
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
                }
                retbuf[j++] = sep_char;
            }
            while (i < arglen) {
                c = argbuf[i++];
                retbuf[j++] = Py_hexdigits[c >> 4];
                retbuf[j++] = Py_hexdigits[c & 0x0f];
            }
            assert(j == resultlen);
        }
        else {
            i = arglen - 1;
            j = resultlen - 1;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i--];
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
                    retbuf[j--] = Py_hexdigits[c >> 4];
                }
                retbuf[j--] = sep_char;
            }
            while (i >= 0) {
                c = argbuf[i--];
                retbuf[j--] = Py_hexdigits[c & 0x0f];
                retbuf[j--] = Py_hexdigits[c >> 4];
            }
            assert(j == -1);
        }
    }

#ifdef Py_DEBUG
    if (!return_bytes) {
        assert(_PyUnicode_CheckConsistency(retval, 1));
    }
#endif
    return retval;
}

PyObject *
_Py_strhex(const char* argbuf, Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen, NULL, 0, 0);
}

/* Same as above but returns bytes() instead of str() to avoid the need
 * to decode the str() when bytes are needed. */
PyObject *
_Py_strhex_bytes(const char* argbuf, Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1);
}

/* These variants include support for a separator between every N bytes: */

PyObject *
_Py_strhex_with_sep(const char* argbuf, Py_ssize_t arglen,
                    PyObject* sep, Py_ssize_t bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 0);
}

/* Same as above but returns bytes() instead of str() to avoid the need
 * to decode the str() when bytes are needed. */
PyObject *
_Py_strhex_bytes_with_sep(const char* argbuf, Py_ssize_t arglen,
                          PyObject* sep, Py_ssize_t bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1);
}
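/* Usage sketch (illustrative only; the real callers live elsewhere in
 * the tree, e.g. bytes.hex() in Objects/bytesobject.c and
 * binascii.hexlify() in Modules/binascii.c):
 *
 *     const char buf[] = "\xde\xad\xbe\xef";
 *     PyObject *s = _Py_strhex(buf, 4);                   // str "deadbeef"
 *     PyObject *sep = PyUnicode_FromString(":");
 *     PyObject *g = _Py_strhex_with_sep(buf, 4, sep, 1);  // str "de:ad:be:ef"
 *     Py_DECREF(sep);
 */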