I have some problem with shared memory in my program.
After for cycle values in shared memory are different every time.
If I use only one block in grid - result is correct.
If I use more then one block - result is invalid.
Shared memory variable written in global space.
Hi,
can you give some more details about your problem? Perhaps a code snipped?
I delete some unused code from listing.
I detect problem in process() function (when for cycles is work).
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#define CUDA_CHECK_ERROR(err) \
if (err != cudaSuccess) { \
printf("Error in file: %s, line: %s\n", __FILE__, __LINE__); \
printf(cudaGetErrorString(err)); \
printf("\n"); \
} \
constant unsigned int md5constant[64] =
{
0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
};
shared unsigned int partialHash[4];
shared unsigned int waitingData[16];
shared unsigned int bitTotal[2];
shared unsigned char hashSum[16];
shared unsigned int msg[32];
//
//
//
global void get_md5_constant(unsigned int* nums);
device unsigned int shiftLeft(unsigned int val, unsigned int step);
global void get_md5_constant(unsigned int* nums)
{
nums[threadIdx.x] = md5constant[threadIdx.x];
};
device void reset()
{
for (int i = 0; i < 16; i++)
{
waitingData[i] = 0;
}
__syncthreads();
for (int i = 0; i < 2; i++)
{
bitTotal[i] = 0;
}
__syncthreads();
partialHash[0] = 0x67452301;
partialHash[1] = 0xefcdab89;
__syncthreads();
partialHash[2] = ~partialHash[0];
partialHash[3] = ~partialHash[1];
__syncthreads();
};
device void process()
{
unsigned int a = partialHash[0];
unsigned int b = partialHash[1];
unsigned int c = partialHash[2];
unsigned int d = partialHash[3];
__syncthreads();
unsigned int dataValue = 0;
for (int i = 0; i < 16; i++)
{
dataValue = waitingData[i];
waitingData[i] = 0;
msg[i] = msg[i + 16] = dataValue;
}
__syncthreads();
int index = 0;
for (int i = -16; i != 0; i += 4)
{
a += msg[i + 16] + md5constant[index] + (d ^ (b & (c ^ d)));
a = b + shiftLeft(a, 7);
index++;
d += msg[i + 17] + md5constant[index] + (c ^ (a & (b ^ c)));
d = a + shiftLeft(d, 12);
index++;
c += msg[i + 18] + md5constant[index] + (b ^ (d & (a ^ B)));
c = d + shiftLeft(c, 17);
index++;
b += msg[i + 19] + md5constant[index] + (a ^ (c & (d ^ a)));
b = c + shiftLeft(b, 22);
index++;
}
__syncthreads();
for (int i = -16; i != 0; i += 4)
{
a += msg[i + 17] + md5constant[index] + (c ^ (d & (b ^ c)));
a = b + shiftLeft(a, 5);
index++;
d += msg[i + 22] + md5constant[index] + (b ^ (c & (a ^ B)));
d = a + shiftLeft(d, 9);
index++;
c += msg[i + 27] + md5constant[index] + (a ^ (b & (d ^ a)));
c = d + shiftLeft(c, 14);
index++;
b += msg[i + 16] + md5constant[index] + (d ^ (a & (c ^ d)));
b = c + shiftLeft(b, 20);
index++;
}
__syncthreads();
for (int i = 16; i != 0; i -= 4)
{
a += msg[i + 5] + md5constant[index] + ((b ^ c) ^ d);
a = b + shiftLeft(a, 4);
index++;
d += msg[i + 8] + md5constant[index] + (a ^ (b ^ c));
d = a + shiftLeft(d, 11);
index++;
c += msg[i + 11] + md5constant[index] + ((d ^ a) ^ B);
c = d + shiftLeft(c, 16);
index++;
b += msg[i + 14] + md5constant[index] + (c ^ (d ^ a));
b = c + shiftLeft(b, 23);
index++;
}
__syncthreads();
for (int i = 16; i != 0; i -= 4)
{
a += msg[i] + md5constant[index] + (c ^ (~d | B));
a = b + shiftLeft(a, 6);
index++;
d += msg[i + 7] + md5constant[index] + (b ^ (~c | a));
d = a + shiftLeft(d, 10);
index++;
c += msg[i + 14] + md5constant[index] + (a ^ (~b | d));
c = d + shiftLeft(c, 15);
index++;
b += msg[i + 5] + md5constant[index] + (d ^ (~a | c));
b = c + shiftLeft(b, 21);
index++;
}
__syncthreads();
partialHash[0] = partialHash[0] + a;
partialHash[1] = partialHash[1] + b;
partialHash[2] = partialHash[2] + c;
partialHash[3] = partialHash[3] + d;
};
device void final(unsigned char* B)
{
unsigned bit0 = bitTotal[0];
unsigned bit1 = bitTotal[1];
unsigned occupied = bit0 & 511;
waitingData[occupied / 32] = 0x80 << (int)(occupied & 31);
occupied += 8;
for (int i = 0; i < 4; i++)
{
unsigned int hashElement = partialHash[i];
hashSum[4 * i] = (unsigned char)((hashElement) & 255);
hashSum[4 * i + 1] = (unsigned char)((hashElement >> 8) & 255);
hashSum[4 * i + 2] = (unsigned char)((hashElement >> 16) & 255);
hashSum[4 * i + 3] = (unsigned char)((hashElement >> 24) & 255);
}
__syncthreads();
for (int i = 0; i < 16; i++)
{
b[i] = hashSum[i];
}
__syncthreads();
};
device unsigned int shiftLeft(unsigned int val, unsigned int step)
{
unsigned int overflowBits = val >> (32 - step);
val <<= step;
val |= overflowBits;
return val;
};
global void computeMD5Gpu(unsigned char* bytes)
{
reset();
process();
final(bytes);
};
host void ts()
{
unsigned char hash[17];
memset(hash, 0, sizeof(hash));
unsigned char* devHash;
cudaMalloc((void**)&devHash, sizeof(hash));
dim3 grid = dim3(1, 1, 1);
dim3 block = dim3(16, 16, 1);
cudaEvent_t syncEvent;
cudaEventCreate(&syncEvent);
computeMD5Gpu<<<grid, block>>>(devHash);
cudaEventRecord(syncEvent, 0);
cudaEventSynchronize(syncEvent);
cudaMemcpy(hash, devHash, sizeof(hash), cudaMemcpyDeviceToHost);
cudaFree(devHash);
cudaEventDestroy(syncEvent);
for (int i = 0; i < 16; i++)
{
printf("%i\n",hash[i]);
}
printf("\n");
}
host int main()
{
ts();
return 0;
}
[/codebox]
All your threads are doing the same thing. They will interfere with each other when they refer to shared memory or global memory.
Thanks