Hi,
The following program simply generates n numbers on the host, copies them to the device, then copies them back to the host and prints them. Basically, I need this as part of some code that I am writing. My requirement is that I must be able to generate and pass more than 1024 elements. In the code below each thread loads two elements, so given the limit of 512 threads per block, a single block can access no more than 1024 elements. I therefore introduced multiple blocks by computing a global thread index with the following statement:
int thid = blockIdx.x * blockDim.x + threadIdx.x;
The program is working fine. But for debugging purposes I want to know which block of threads is actually working on (that is, copying) which part of the data. For example, suppose we generate 8 numbers:
g_idata = [1 2 3 4 5 6 7 8 ]
And two blocks, B0 and B1.
Now, once I run this program I want the output something like this:
[b]
I am Block B0. I am operating on the data set [1 2 3 4]
I am Block B1. I am operating on the data set [5 6 7 8 ]
[/b]
In short, I want to know the data corresponding to each block. I would be grateful to you guys for helping me here.
Thanks for your precious time.
/*
 * Abort the program with a readable message if a CUDA runtime call failed.
 *
 * status: value returned by the CUDA runtime call (or cudaGetLastError()).
 * what:   short description of the operation, used only in the message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Copies g_idata[0..n) into out[0..n), staging every value through dynamic
 * shared memory. Each thread handles up to two elements: one from the lower
 * half of the array (index thid) and one from the upper half (thid + n/2).
 *
 * Launch requirements:
 *   - 1D grid and 1D blocks, with at least n - n/2 threads in total;
 *     surplus threads do nothing.
 *   - dynamic shared memory of at least 2 * blockDim.x * sizeof(float) bytes.
 *
 * out:         destination array of n floats.
 * g_idata:     source array of n floats.
 * g_blockSums: not used by this kernel; kept so the signature is unchanged.
 * n:           number of elements to copy.
 */
__global__ void modified(float *out, float *g_idata, float *g_blockSums, unsigned int n)
{
    extern __shared__ float temp[]; // allocated on invocation

    const unsigned int half = n / 2;
    const unsigned int thid = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared memory is private to a block, so it is indexed with the
    // block-local thread index. Indexing it with the global index would force
    // every block to reserve room for the whole data set.
    const unsigned int sa = threadIdx.x;
    const unsigned int sb = threadIdx.x + blockDim.x;

    // Lower half: indices [0, n/2).
    if (thid < half) {
        const unsigned int ai = thid;
        temp[sa] = g_idata[ai];
        out[ai] = temp[sa];
    }

    // Upper half: indices [n/2, n). Written as thid < n - half (instead of
    // thid + half < n) so the comparison cannot wrap around.
    if (thid < n - half) {
        const unsigned int bi = thid + half;
        temp[sb] = g_idata[bi];
        out[bi] = temp[sb];
    }
    // No __syncthreads() is needed: each thread reads back only the shared
    // slots it wrote itself.
}

/*
 * Reads the element count and the block size (elements per block) from stdin,
 * fills a host array with 1..n, round-trips it through the device using the
 * `modified` kernel and prints the result.
 *
 * Returns 0 on success and EXIT_FAILURE on invalid input or allocation
 * failure; CUDA runtime errors terminate the program via checkCuda().
 */
int main()
{
    unsigned int num_Elements = 0, blockSize = 0; // Number of data elements, elements per block

    printf("Please enter the number of elements");
    if (scanf("%u", &num_Elements) != 1 || num_Elements == 0) {
        fprintf(stderr, "The number of elements must be a positive integer\n");
        return EXIT_FAILURE;
    }
    printf("Please enter the Block Size");
    // Each thread loads two elements, so a block needs at least two elements
    // to own one thread.
    if (scanf("%u", &blockSize) != 1 || blockSize < 2) {
        fprintf(stderr, "The block size must be an integer >= 2\n");
        return EXIT_FAILURE;
    }

    size_t size = sizeof(float) * num_Elements; // Size of memory
    size_t sizeBlockSums = sizeof(float) * 1;

    // g --> global (device) memory, m --> main (host) memory
    float *out = NULL, *g_idata = NULL, *g_blockSums = NULL;
    float *m_idata = (float *)malloc(size);  // Memory allocation on host
    float *m_odata = (float *)malloc(size);
    float *m_blockSums = (float *)malloc(sizeBlockSums);
    if (m_idata == NULL || m_odata == NULL || m_blockSums == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(m_idata);
        free(m_odata);
        free(m_blockSums);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void **)&g_idata, size), "cudaMalloc(g_idata)"); // Memory allocation on device
    checkCuda(cudaMalloc((void **)&out, size), "cudaMalloc(out)");
    // g_blockSums is handed to the kernel and to cudaFree below, so it must be
    // a valid device pointer rather than an uninitialised one.
    checkCuda(cudaMalloc((void **)&g_blockSums, sizeBlockSums), "cudaMalloc(g_blockSums)");

    // Initialization of elements: 1, 2, ..., n
    for (unsigned int i = 0; i < num_Elements; i++) {
        m_idata[i] = (float)(i + 1);
    }

    // Copy m_idata from main memory into g_idata on the device
    checkCuda(cudaMemcpy(g_idata, m_idata, size, cudaMemcpyHostToDevice), "cudaMemcpy host to device");

    // blockDim.x: /2 because one thread loads two elements
    unsigned int nThreads = blockSize / 2;
    // The kernel needs one thread per element of the upper half, which is the
    // larger half when n is odd. Round the block count up so sizes that are
    // not a multiple of the block size are still fully covered; the kernel
    // ignores surplus threads.
    unsigned int threadsNeeded = num_Elements - num_Elements / 2;
    unsigned int nBlocks = (threadsNeeded + nThreads - 1) / nThreads; // Number of blocks
    // Two shared-memory slots per thread, independent of the total data size.
    size_t sharedBytes = 2 * (size_t)nThreads * sizeof(float);

    printf("\nNumber of blocks is %u\n and number of threads per block(Block Dimesion) is %u\n", nBlocks, nThreads);

    modified<<<nBlocks, nThreads, sharedBytes>>>(out, g_idata, g_blockSums, num_Elements); // Call the kernel
    // A launch does not return an error itself; an invalid configuration
    // (too many threads or blocks, too much shared memory) shows up here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // Copy the result from device memory into m_odata in main memory. This
    // copy is blocking, so it also reports errors raised while the kernel ran.
    checkCuda(cudaMemcpy(m_odata, out, size, cudaMemcpyDeviceToHost), "cudaMemcpy device to host");

    for (unsigned int i = 0; i < num_Elements; i++) {
        printf("m_odata[%u] = %f\n", i + 1, m_odata[i]);
    }

    // Free memory of host and device
    checkCuda(cudaFree(g_idata), "cudaFree(g_idata)");
    checkCuda(cudaFree(out), "cudaFree(out)");
    checkCuda(cudaFree(g_blockSums), "cudaFree(g_blockSums)");
    free(m_idata);
    free(m_blockSums);
    free(m_odata);
    return 0;
}