Shared memory problem

MaxFX · March 31, 2009, 5:43am

I have some problem with shared memory in my program.
After for cycle values in shared memory are different every time.
If I use only one block in grid - result is correct.
If I use more then one block - result is invalid.
Shared memory variable written in global space.

navier-stokes · March 31, 2009, 6:32am

Hi,
can you give some more details about your problem? Perhaps a code snipped?

MaxFX · March 31, 2009, 6:50am

I delete some unused code from listing.

I detect problem in process() function (when for cycles is work).

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime_api.h>

#define CUDA_CHECK_ERROR(err) \

if (err != cudaSuccess) { \

printf("Error in file: %s, line: %s\n", __FILE__, __LINE__); \

printf(cudaGetErrorString(err));	\

printf("\n"); \

} \

constant unsigned int md5constant[64] =

{

0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,

0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,

0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,

0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,

0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,

0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,

0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,

0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,

0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,

0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,

0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,

0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,

0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,

0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,

0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,

0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391

};

shared unsigned int partialHash[4];

shared unsigned int waitingData[16];

shared unsigned int bitTotal[2];

shared unsigned char hashSum[16];

shared unsigned int msg[32];

//

global void get_md5_constant(unsigned int* nums);

device unsigned int shiftLeft(unsigned int val, unsigned int step);

global void get_md5_constant(unsigned int* nums)

{

nums[threadIdx.x] = md5constant[threadIdx.x];

};

device void reset()

{

for (int i = 0; i < 16; i++)

{

	waitingData[i] = 0;

}

__syncthreads();



for (int i = 0; i < 2; i++)

{

	bitTotal[i] = 0;

}

__syncthreads();

partialHash[0] = 0x67452301;

partialHash[1] = 0xefcdab89;

__syncthreads();

partialHash[2] = ~partialHash[0];

partialHash[3] = ~partialHash[1];

__syncthreads();

};

device void process()

{

unsigned int a = partialHash[0];

unsigned int b = partialHash[1];

unsigned int c = partialHash[2];

unsigned int d = partialHash[3];

__syncthreads();

unsigned int dataValue = 0;

for (int i = 0; i < 16; i++)

{

	dataValue = waitingData[i];

	waitingData[i] = 0;

	msg[i] = msg[i + 16] = dataValue;

}

__syncthreads();

int index = 0;

for (int i = -16; i != 0; i += 4)

{

	a += msg[i + 16] + md5constant[index] + (d ^ (b & (c ^ d)));

	a = b + shiftLeft(a, 7);

	index++;

	d += msg[i + 17] + md5constant[index] + (c ^ (a & (b ^ c)));

	d = a + shiftLeft(d, 12);

	index++;

	c += msg[i + 18] + md5constant[index] + (b ^ (d & (a ^ B)));

	c = d + shiftLeft(c, 17);

	index++;

	b += msg[i + 19] + md5constant[index] + (a ^ (c & (d ^ a)));

	b = c + shiftLeft(b, 22);

	index++;

}

__syncthreads();

for (int i = -16; i != 0; i += 4)

{

	a += msg[i + 17] + md5constant[index] + (c ^ (d & (b ^ c)));

	a = b + shiftLeft(a, 5);

	index++;

	d += msg[i + 22] + md5constant[index] + (b ^ (c & (a ^ B)));

	d = a + shiftLeft(d, 9);

	index++;

	c += msg[i + 27] + md5constant[index] + (a ^ (b & (d ^ a)));

	c = d + shiftLeft(c, 14);

	index++;

	b += msg[i + 16] + md5constant[index] + (d ^ (a & (c ^ d)));

	b = c + shiftLeft(b, 20);

	index++;

}

__syncthreads();

for (int i = 16; i != 0; i -= 4)

{

	a += msg[i + 5] + md5constant[index] + ((b ^ c) ^ d);

	a = b + shiftLeft(a, 4);

	index++;

	d += msg[i + 8] + md5constant[index] + (a ^ (b ^ c));

	d = a + shiftLeft(d, 11);

	index++;

	c += msg[i + 11] + md5constant[index] + ((d ^ a) ^ B);

	c = d + shiftLeft(c, 16);

	index++;

	b += msg[i + 14] + md5constant[index] + (c ^ (d ^ a));

	b = c + shiftLeft(b, 23);

	index++;

}

__syncthreads();

for (int i = 16; i != 0; i -= 4)

{

	a += msg[i] + md5constant[index] + (c ^ (~d | B));

	a = b + shiftLeft(a, 6);

	index++;

	d += msg[i + 7] + md5constant[index] + (b ^ (~c | a));

	d = a + shiftLeft(d, 10);

	index++;

	c += msg[i + 14] + md5constant[index] + (a ^ (~b | d));

	c = d + shiftLeft(c, 15);

	index++;

	b += msg[i + 5] + md5constant[index] + (d ^ (~a | c));

	b = c + shiftLeft(b, 21);

	index++;

}

__syncthreads();

partialHash[0] = partialHash[0] + a;

partialHash[1] = partialHash[1] + b;

partialHash[2] = partialHash[2] + c;

partialHash[3] = partialHash[3] + d;

};

device void final(unsigned char* B)

{

unsigned bit0 = bitTotal[0];

unsigned bit1 = bitTotal[1];

unsigned occupied = bit0 & 511;

waitingData[occupied / 32] = 0x80 << (int)(occupied & 31);

occupied += 8;

for (int i = 0; i < 4; i++)

{

	unsigned int hashElement = partialHash[i];

	hashSum[4 * i]		= (unsigned char)((hashElement) & 255);

	hashSum[4 * i + 1]	= (unsigned char)((hashElement >> 8) & 255);

	hashSum[4 * i + 2]	= (unsigned char)((hashElement >> 16) & 255);

	hashSum[4 * i + 3]	= (unsigned char)((hashElement >> 24) & 255);

}

__syncthreads();

for (int i = 0; i < 16; i++)

{

	b[i] = hashSum[i];

}

__syncthreads();

};

device unsigned int shiftLeft(unsigned int val, unsigned int step)

{

unsigned int overflowBits = val >> (32 - step);

val <<= step;

val |= overflowBits;

return val;

};

global void computeMD5Gpu(unsigned char* bytes)

{

reset();

process();

final(bytes);

};

host void ts()

{

unsigned char hash[17];

memset(hash, 0, sizeof(hash));

unsigned char* devHash;

cudaMalloc((void**)&devHash, sizeof(hash));

dim3 grid = dim3(1, 1, 1);

dim3 block = dim3(16, 16, 1);

cudaEvent_t syncEvent;

cudaEventCreate(&syncEvent);

computeMD5Gpu<<<grid, block>>>(devHash);

cudaEventRecord(syncEvent, 0);

cudaEventSynchronize(syncEvent);



cudaMemcpy(hash, devHash, sizeof(hash), cudaMemcpyDeviceToHost);

cudaFree(devHash);

cudaEventDestroy(syncEvent);

for (int i = 0; i < 16; i++)

{

	printf("%i\n",hash[i]);

}

printf("\n");

}

host int main()

{

ts();

return 0;

}

[/codebox]

Jamie_K · March 31, 2009, 11:46pm

All your threads are doing the same thing. They will interfere with each other when they refer to shared memory or global memory.

MaxFX · April 1, 2009, 7:33am

Thanks

Topic		Replies	Views
Global memory access problem CUDA Programming and Performance	9	3469	July 13, 2011
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018
Shared memory error CUDA Programming and Performance	1	896	June 24, 2012
Using shared Memory CUDA Programming and Performance	3	4867	March 11, 2012
why result varied based on different number of threads per block? CUDA Programming and Performance	8	1929	March 1, 2011
Annoying problems with memory and/or syntax CUDA Programming and Performance	19	4767	April 8, 2008
Getting wrong output from CUDA kernel CUDA Programming and Performance	6	8280	April 15, 2011
Very strange share memory bank conflicts CUDA Programming and Performance cuda	4	487	November 2, 2021
copy from 1D array to shared memory matrix in cuda CUDA Programming and Performance	7	2053	June 9, 2015
Invalid __global__ write of size 4. Need help with debugging CUDA Programming and Performance cuda	3	927	October 12, 2021

Shared memory problem

Related topics