Weird problem with a while loop

Hello

I have a weird problem with a while loop in my CUDA code:

int x = blockDim.x * blockIdx.x;

while (x < arraysize){

	x += threadIdx.x;

	if (x < arraysize){
		for (int i=0;i < vectorsize ...)
			sharedmem[threadIdx.x] = globalarray[x]; //each thread reads a vector to shared memory from global memory
	}

	if (threadIdx.x < vectorsize){
		for (i=0;i < numvectors){
			//each thread adds a component of vector i and stores the result back to some other shared memory
		}
	}

	iteration++;
	x = blockDim.x * blockIdx.x + gridDim.x * blockDim.x * iteration;
}

... some other code

The kernel is launched with 128 threads per block, but inside the while loop not all values of x get processed: all values from 0 to 31 get processed, 32 and onward do not, some other values get processed by other blocks, and some get processed more than once. If I replace the while() with an if(), then all values from 0 to 128*nblocks get processed and it seems to work fine, except that each block should do this processing more than once. Does anyone have an idea how this is possible, or what could be wrong with the code?

Thanks

Tom

Your code looks wrong for various reasons…

For example, look at this:

for (int i=0;i < vectorsize ...)
	sharedmem[threadIdx.x] = globalarray[x]; //each thread reads a vector to shared memory from global memory

sharedmem[threadIdx.x] remains the same for every iteration of the for loop, and globalarray[x] also remains the same for every iteration of the for loop…

So why run the for loop in the first place?
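If the intention was for each iteration to copy one component of the vector, it would presumably look more like this (just a guess at the indexing you left out; the vectorsize-strided layout is an assumption on my part):

for (int i=0; i < vectorsize; i++)
	sharedmem[threadIdx.x*vectorsize + i] = globalarray[x*vectorsize + i]; //copy component i of this thread's vector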

OK, here is an example that memsets an array in GPU memory. Maybe it can help you structure your code better.

for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x)
{
	array[i] = MEMSET_VALUE;
}

This code will set every element exactly once, no matter how many blocks you launch.
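Wrapped into a complete kernel with a launch, it would look roughly like this (the kernel name, N, MEMSET_VALUE and the launch configuration are just placeholders, not from your code):

__global__ void memsetKernel(int *array, int N, int MEMSET_VALUE)
{
	//grid-stride loop: each element is written exactly once, independent of the number of blocks launched
	for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < N; i += blockDim.x*gridDim.x)
		array[i] = MEMSET_VALUE;
}

//launched with any reasonable grid, e.g.: memsetKernel<<<64, 128>>>(devArray, N, 0);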

I know that (I forgot to type it here), but I don't think that's the problem, because if I replace the while loop with an if statement, process the data only once, and generate more blocks so that every thread handles just one array item, then the code works fine. The only problem is that all these blocks generate a lot of data (for example, 96 blocks will write 96 result tables), which creates overhead for the next kernel, which then has to process 96 tables. With the while loop I only need to create as many blocks as there are multiprocessors, and each thread should process multiple array elements.
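The launch I have in mind is roughly this (a sketch only; the kernel name and shared memory size are placeholders, 128 threads is my current block size):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int numBlocks = prop.multiProcessorCount; //one block per multiprocessor
//myKernel<<<numBlocks, 128, sharedMemBytes>>>(...); //each block then loops over several chunks of the array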

This won't work in my case, because inside the loop some of the threads need to process the data after the memory read.
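The general shape I'm after is something like this (a stripped-down sketch using the names from my pseudocode above; the important point is that every thread in the block executes the loop, and the __syncthreads(), the same number of times):

for (int base = blockDim.x * blockIdx.x; base < arraysize; base += gridDim.x * blockDim.x)
{
	int x = base + threadIdx.x;
	if (x < arraysize)
		sharedmem[threadIdx.x] = globalarray[x]; //cooperative load of one tile into shared memory
	__syncthreads();
	if (threadIdx.x < vectorsize)
	{
		//a subset of the threads processes the tile that was just loaded
	}
	__syncthreads(); //make sure the processing is finished before the next tile is loaded
}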

This is what the exact code looks like now:

extern __shared__ char smem[];

	float * centroidTable = (float *) & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) + sizeof(int) , blockDim.x) + IMUL(IMUL(sizeof(float), vectorSizeY) + sizeof(int) , threadIdx.x)];
	float * vectorTable;
	int   * clusterIdTable;

	for (int i=0;i<=vectorSizeY;i++) centroidTable[i]=0.0; //reset centroid table & count table on the first loop

	int iteration = 0;
	int x = IMUL(blockDim.x , blockIdx.x); //the starting x position of this block

	while (x < totalProperties){
		//each thread reads a vector from devProperties memory and puts it in shared memory, then thread 0 (+ some other threads) calcs the sum
		vectorTable    = (float *) & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) , threadIdx.x)];
		clusterIdTable = (int *)   & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) , blockDim.x ) + IMUL(sizeof(int) , threadIdx.x)];
		centroidTable  = (float *) & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) + sizeof(int) , blockDim.x) + IMUL(IMUL(sizeof(float), vectorSizeY) + sizeof(int) , threadIdx.x)];

		int numVecRead = totalProperties - x > blockDim.x ? blockDim.x : totalProperties - x; //holds how many vectors will be read by this block this time

		x += threadIdx.x;
		if (x < totalProperties){
			for (unsigned char propId=0;propId<vectorSizeY;propId++){ //each thread loads a vector
				vectorTable[propId] = devProperties[x+IMUL(propId , propertiesPitch)]; //coalesced input of 1 vector to shared memory (rows are the vector components)
			}
			clusterIdTable[0] = clusterIds[x];
		}

		__syncthreads();

		if (threadIdx.x < vectorSizeY){ //each thread calcs the sum of 1 property instead of thread 0 calculating all sums (this saves 40ms on a 1024x512 image)
			vectorTable    = (float *) & smem[0];                                                                  //make vectorTable point to the first vector (threadIdx.x = 0)
			clusterIdTable = (int *)   & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) , blockDim.x )];              //make clusterIdTable point to the first clusterId
			centroidTable  = (float *) & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) + sizeof(int) , blockDim.x)]; //make it point to centroidTable index 0

			for (short vecNr=0;vecNr<numVecRead;vecNr++){
				int centroidTableIdx = IMUL(clusterIdTable[vecNr] , (vectorSizeY+1)); //calculate the index in the centroid table based on cluster id
				centroidTable[centroidTableIdx + threadIdx.x] += vectorTable[IMUL(vecNr,vectorSizeY) + threadIdx.x];
				if (threadIdx.x==0) (*((int *) & centroidTable[centroidTableIdx+vectorSizeY]))++; //increase the vector count (only for thread 0) note: this is an integer stored in a float table!
				__syncthreads(); //sync and process the next vector
			}
		}

		iteration++;
		x = IMUL(blockDim.x, blockIdx.x) + IMUL(IMUL(gridDim.x,blockDim.x),iteration); //calc the block x
	}

	__syncthreads();

	centroidTable = (float *) & smem[IMUL(IMUL(sizeof(float) , vectorSizeY) + sizeof(int) , blockDim.x) + IMUL(IMUL(sizeof(float), vectorSizeY) + sizeof(int) , threadIdx.x)]; //make it point to the right centroid table

	//x =;
	if (threadIdx.x < totalClusters){ //some threads write the 'result table' which holds the sum & count for each cluster
		for (unsigned char j=0;j<=vectorSizeY;j++){
			//centroids[IMUL(IMUL(gridDim.x , totalClusters) , j) + IMUL(blockIdx.x , totalClusters)+x]=centroidTable[j]; //write memory (coalesced)
			centroids[IMUL(totalCentroidPitch,j) + IMUL(blockIdx.x,centroidPitch) + threadIdx.x + centroidPitch]=centroidTable[j];
		}
		//x+=blockDim.x;
	}