Hi guys, my first post here. I’ve tried a few approaches, but I can’t seem to get the right way to use shared in this specific code.
I’ve succesfully used shared variables in other parts and codes, but on this one the results don’t match and I couldn’t figure out why.
The code is a algo calculation. I noticed there was some room for parallel computing for the W variable below. So I created a shared s_W with the same size, for threads to cooperate (each one computes a part in parallel) and, after que computation, the W local thread variable would receive the full computed values.
I thought it would work, because W is a thread local variable, but I can’t seem to figure out why the results don’t match.
The original code just has W instead of s_W and, of course, no (if thr ==) parts.
I hope someone can enlighten me :)
Thanks,
Rod.
uint32_t W[64];
__shared__ uint32_t s_W[64];
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
__syncthreads();
int thr = threadIdx.x;
if (thr == 0)
{
uint32_t a = P[i];
uint32_t b = P[i + 4];
uint32_t c = h[i + 8];
uint32_t d = P[i + 8];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = (ab & 0x80808080);
uint32_t t2 = (bc & 0x80808080);
uint32_t t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
s_W[0 + i] = abx ^ bc ^ d;
s_W[0 + i + 4] = bcx ^ a ^ cd;
s_W[0 + i + 8] = cdx ^ ab ^ d;
s_W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
if (thr == 1)
{
uint32_t a = P[12 + i];
uint32_t b = h[i + 4];
uint32_t c = P[12 + i + 4];
uint32_t d = P[12 + i + 8];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = (ab & 0x80808080);
uint32_t t2 = (bc & 0x80808080);
uint32_t t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
s_W[16 + i] = abx ^ bc ^ d;
s_W[16 + i + 4] = bcx ^ a ^ cd;
s_W[16 + i + 8] = cdx ^ ab ^ d;
s_W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
if (thr == 2)
{
uint32_t a = h[i];
uint32_t b = P[24 + i + 0];
uint32_t c = P[24 + i + 4];
uint32_t d = P[24 + i + 8];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = (ab & 0x80808080);
uint32_t t2 = (bc & 0x80808080);
uint32_t t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
s_W[32 + i] = abx ^ bc ^ d;
s_W[32 + i + 4] = bcx ^ a ^ cd;
s_W[32 + i + 8] = cdx ^ ab ^ d;
s_W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
if (thr == 3)
{
uint32_t a = P[36 + i];
uint32_t b = P[36 + i + 4];
uint32_t c = P[36 + i + 8];
uint32_t d = h[i + 12];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = (ab & 0x80808080);
uint32_t t2 = (bc & 0x80808080);
uint32_t t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
s_W[48 + i] = abx ^ bc ^ d;
s_W[48 + i + 4] = bcx ^ a ^ cd;
s_W[48 + i + 8] = cdx ^ ab ^ d;
s_W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
}
__syncthreads();
}
__syncthreads();
#pragma unroll 64
for (int j = 0; j < 64; j++)
W[j] = s_W[j];