Can some one check and confirm that this kernel does what i think it does? Im having trouble getting to grips with threads…
so:
-
in main function, a block of threads is created with dimensions 16x1
-
a 1x1 grid is then made
then in testThreads function:
make i = x data of the thread block and thread id
make j the same but for y dimensions
index is then identifyer the different threads
then if (i <N && j <1)
induvidual thread does the maths for each value in the array
back to main function, memcopy to host
finish.
I think this is correct however i am getting an odd error with the output here it is:
0; 12; 7; 16; 13; 29; 18; 15; 19; 22; 27; 9; 11; 23; 17; 21;
i dont know why the very first value is not computed? not even set to 5 in the testThreads function? why is that? im guessing something im doing with the threads is making it do that?
Thanks in advance for any help
heres the code…
#include <stdio.h>
#include <cutil.h>
#define OUTFILE "output.txt"
#define blocksize 16
#define N 16
__global__ void testThreads(unsigned int deviceBuff[]);
__constant__ unsigned int numbers[16] = {3,7,2,11,8,24,13,10,14,17,22,4,6,18,12,16};
__global__ void testThreads(unsigned int deviceBuff[]){
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int index = i+j*N;
if (i<N && j< 1){
deviceBuff[index] = 5;
deviceBuff[index] = deviceBuff[index]+numbers[index];
}
}
int main (int argc, char** argv){
unsigned int * hostBuff;
unsigned int * deviceBuff;
hostBuff = (unsigned int*) malloc(sizeof(unsigned int)*N);
printf("\nAllocated %d host memory bytes...", sizeof(unsigned int)*N);
CUDA_SAFE_CALL(cudaMalloc((void**)&deviceBuff, sizeof(unsigned int)*N));
printf("\nAllocated %d device memory bytes...", sizeof(unsigned int)*N);
dim3 dimBlock (blocksize, 1);
dim3 dimGrid (N/dimBlock.x,1);
testThreads<<<dimGrid,dimBlock>>>(deviceBuff);
CUDA_SAFE_CALL(cudaMemcpy(hostBuff, deviceBuff, sizeof(unsigned int)*N, cudaMemcpyDeviceToHost));
printf("\nCopied %d device memory bytes to host...", sizeof(unsigned int)*N);
free(hostBuff);
cudaFree(deviceBuff);
printf("\nFreed memory on host and device...");
FILE* output;
output=fopen(OUTFILE, "w");
for (int x = 0; x< N; x++){
fprintf(output, "%d; ", hostBuff[x]);
}
fclose(output);
printf("\nPrinted output to file...");
printf("EXIT");
return 0;
}