If you are on a Maxwell or Pascal GPU, it might make sense to rewrite integer-multiply-heavy core computation in terms of the native 16x16+32 bit multiply-add of those architectures. The 32-bit multiplies in your PTX code are emulated for those architectures. FWIW, I am not sure that using a special squaring routine is going to save much (or any) work on architectures with native 32x32+32 bit IMAD support. You might want to try the code below.

```
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "cuda_profiler_api.h"
__host__ __device__ ulonglong2 umul64wide (uint64_t a, uint64_t b)
{
ulonglong2 res;
// Fermi, Kepler and Volta, Turing provide a native 32-bit IMAD
#if (((__CUDA_ARCH__ >= 200) && (__CUDA_ARCH__ < 500)) || \
((__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 800)))
asm ("{\n\t"
".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;\n\t"
"mov.b64 {alo,ahi}, %2;\n\t"
"mov.b64 {blo,bhi}, %3;\n\t"
"mul.lo.u32 r0, alo, blo;\n\t"
"mul.hi.u32 r1, alo, blo; \n\t"
"mad.lo.cc.u32 r1, alo, bhi, r1;\n\t"
"madc.hi.u32 r2, alo, bhi, 0;\n\t"
"mad.lo.cc.u32 r1, ahi, blo, r1;\n\t"
"madc.hi.cc.u32 r2, ahi, blo, r2;\n\t"
"madc.hi.u32 r3, ahi, bhi, 0;\n\t"
"mad.lo.cc.u32 r2, ahi, bhi, r2;\n\t"
"addc.u32 r3, r3, 0;\n\t"
"mov.b64 %0, {r0,r1};\n\t"
"mov.b64 %1, {r2,r3};\n\t"
"}"
: "=l"(res.x), "=l"(res.y)
: "l"(a), "l"(b));
// Maxwell and Pascal do not provide native a 32-bit IMAD, but provide XMAD
#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
asm ("{\n\t"
".reg .u32 alo, ahi, blo, bhi, r0, r1, r2, r3;\n\t"
".reg .u32 s0, s1, s2, s3, t0, t1, t2, t3;\n\t"
".reg .u16 a0, a1, a2, a3, b0, b1, b2, b3;\n\t"
// split inputs into 16-bit chunks
"mov.b64 {alo,ahi}, %2;\n\t"
"mov.b64 {blo,bhi}, %3;\n\t"
"mov.b32 {a0,a1}, alo;\n\t"
"mov.b32 {a2,a3}, ahi;\n\t"
"mov.b32 {b0,b1}, blo;\n\t"
"mov.b32 {b2,b3}, bhi;\n\t"
// first partial sum:
// a3b3.wide a1b3.wide a0b2.wide a0b0.wide
// 0 a2b2.wide a1b1.wide
// 0 a3b1.wide a2b0.wide
"mul.wide.u16 r0, a0, b0;\n\t"
"mul.wide.u16 r1, a0, b2;\n\t"
"mul.wide.u16 r2, a1, b3;\n\t"
"mul.wide.u16 r3, a3, b3;\n\t"
"mul.wide.u16 t1, a1, b1;\n\t"
"mul.wide.u16 t2, a2, b2;\n\t"
"add.cc.u32 r1, r1, t1;\n\t"
"addc.cc.u32 r2, r2, t2;\n\t"
"addc.u32 r3, r3, 0;\n\t"
"mul.wide.u16 t1, a2, b0;\n\t"
"mul.wide.u16 t2, a3, b1;\n\t"
"add.cc.u32 r1, r1, t1;\n\t"
"addc.cc.u32 r2, r2, t2;\n\t"
"addc.u32 r3, r3, 0;\n\t"
// second partial sum:
// a2b3.wide a0b3.wide a0b1.wide
// a3b2.wide a1b2.wide a1b0.wide
// 0 a2b1.wide
// 0 a3b0.wide
"mul.wide.u16 s0, a0, b1;\n\t"
"mul.wide.u16 s1, a0, b3;\n\t"
"mul.wide.u16 s2, a2, b3;\n\t"
"mul.wide.u16 t1, a2, b1;\n\t"
"add.cc.u32 s1, s1, t1;\n\t"
"addc.u32 s2, s2, 0;\n\t"
"mul.wide.u16 t1, a3, b0;\n\t"
"add.cc.u32 s1, s1, t1;\n\t"
"addc.u32 s2, s2, 0;\n\t"
"mul.wide.u16 t0, a1, b0;\n\t"
"mul.wide.u16 t1, a1, b2;\n\t"
"mul.wide.u16 t2, a3, b2;\n\t"
"add.cc.u32 s0, s0, t0;\n\t"
"addc.cc.u32 s1, s1, t1;\n\t"
"addc.cc.u32 s2, s2, t2;\n\t"
"addc.u32 s3, 0, 0;\n\t"
// offset second partial sum by 16 bits to the left
"shf.l.clamp.b32 t3, s2, s3, 16;\n\t"
"shf.l.clamp.b32 t2, s1, s2, 16;\n\t"
"shf.l.clamp.b32 t1, s0, s1, 16;\n\t"
"shf.l.clamp.b32 t0, 0, s0, 16;\n\t"
// add first sum in r{0,1,2,3} to second sum in t{0,1,2,3}
"add.cc.u32 r0, r0, t0;\n\t"
"addc.cc.u32 r1, r1, t1;\n\t"
"addc.cc.u32 r2, r2, t2;\n\t"
"addc.u32 r3, r3, t3;\n\t"
// pack outputs
"mov.b64 %0, {r0,r1};\n\t"
"mov.b64 %1, {r2,r3};\n\t"
"}"
: "=l"(res.x), "=l"(res.y)
: "l"(a), "l"(b));
#elif defined __CUDA_ARCH__
#error unsupported __CUDA_ARCH__
#else // HOST code
// derived from Henry Warren, "Hacker's Delight 2nd ed.", 2013, figure 8-2
uint64_t u0, v0, u1, v1, w0, w1, w2, t;
u0 = (uint32_t)a; u1 = a >> 32;
v0 = (uint32_t)b; v1 = b >> 32;
w0 = u0 * v0;
t = u1 * v0 + (w0 >> 32);
w1 = (uint32_t)t;
w2 = t >> 32;
w1 = u0 * v1 + w1;
res.y = u1 * v1 + w2 + (w1 >> 32);
res.x = (w1 << 32) + (uint32_t)w0;
#endif
return res;
}
__global__ void mul64wide_test (const uint64_t * __restrict__ a,
const uint64_t * __restrict__ b,
ulonglong2 * __restrict__ r, int n)
{
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < n; i += stride) {
r[i] = umul64wide (a[i], b[i]);
}
}
#define PURELY_RANDOM (1)
#define REPEAT (200)
#define LEN (12000000)
#define THREADS (128)
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaDeviceSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579) /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)
int main (void)
{
uint64_t *a, *b, *d_a = 0, *d_b = 0;
ulonglong2 *r, *d_r = 0;
/* Allocate memory on the host */
a = (uint64_t *)malloc (sizeof(a[0]) * LEN);
b = (uint64_t *)malloc (sizeof(b[0]) * LEN);
r = (ulonglong2 *)malloc (sizeof(r[0]) * LEN);
/* Allocate memory on device */
CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0])*LEN));
CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0])*LEN));
CUDA_SAFE_CALL (cudaMalloc((void**)&d_r, sizeof(d_r[0])*LEN));
for (int j = 0; j < REPEAT; j++) {
printf ("\r%d", j);
for (int i = 0; i < LEN; i++) {
#if PURELY_RANDOM
a[i] = KISS;
a[i] = (a[i] << 32) + KISS;
b[i] = KISS;
b[i] = (b[i] << 32) + KISS;
#else // PURELY_RANDOM
int s1, s2, d;
s1 = KISS & 63; s2 = KISS & 63; d = KISS & 1;
a[i] = (d) ? ((0x1ULL << s1) - (0x1ULL << s2)) :
((0x1ULL << s1) + (0x1ULL << s2));
s1 = KISS & 63; s2 = KISS & 63; d = KISS & 1;
b[i] = (d) ? ((0x1ULL << s1) - (0x1ULL << s2)) :
((0x1ULL << s1) + (0x1ULL << s2));
#endif // PURELY_RANDOM
}
CUDA_SAFE_CALL (cudaMemcpy (d_a, a, sizeof(d_a[0])*LEN,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL (cudaMemcpy (d_b, b, sizeof(d_b[0])*LEN,
cudaMemcpyHostToDevice));
/* Compute execution configuration */
dim3 dimBlock (THREADS);
int threadBlocks = (LEN + (dimBlock.x - 1)) / dimBlock.x;
if (threadBlocks > 65520) threadBlocks = 65520;
dim3 dimGrid(threadBlocks);
mul64wide_test<<<dimGrid,dimBlock>>>(d_a, d_b, d_r, LEN);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaMemcpy (r, d_r, sizeof(r[0]) * LEN,
cudaMemcpyDeviceToHost));
for (int i = 0; i < LEN; i++) {
ulonglong2 ref = umul64wide (a[i], b[i]);
ulonglong2 res = r[i];
if ((res.x != ref.x) || (res.y != ref.y)) {
printf ("!!!! a=%016llx b=%016llx res=%016llx%016llx ref=%016llx%016llx\n", a[i], b[i], res.y, res.x, ref.y, ref.x);
}
}
}
printf ("\n");
CUDA_SAFE_CALL (cudaFree(d_a));
CUDA_SAFE_CALL (cudaFree(d_b));
CUDA_SAFE_CALL (cudaFree(d_r));
CUDA_SAFE_CALL (cudaDeviceSynchronize());
cudaProfilerStop();
free (a);
free (b);
free (r);
return EXIT_SUCCESS;
}
```