Hi everyone;
I wrote a code to calculate summation of array parts. Code works, but looks like there is a limitation which I did not know.
Firstly, the code is follows:
[codebox]#include <stdio.h>
#include <time.h>
#include <cuda.h>
#ifdef DEVICE_EMULATION
#define EMUSYNC __syncthreads()
#define EMUSYNC
global void SumArray(float *data, float *odata, unsigned int n)
{
unsigned int VirtualBlockSize = n/gridDim.x;
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * VirtualBlockSize + tid;
unsigned int finish = (blockIdx.x + 1) * VirtualBlockSize;
unsigned int step = blockDim.x;
unsigned int blockSize = blockDim.x;
shared float sdata[256];
sdata[tid] = 0;
while(i<finish)
{
sdata[tid] += data[i];
i += step;
}
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
#ifndef DEVICE_EMULATION
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; EMUSYNC; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; EMUSYNC; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; EMUSYNC; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) odata[blockIdx.x] = sdata[0];
}
int main()
{
float *aD, *cD, *aH, *cH, deger=0;
const unsigned int N = 100*1024*1024;
const unsigned int memSize = sizeof(float) * N;
const unsigned int NumberOfElements = 1024;
const unsigned int CmemSize = sizeof(float) * NumberOfElements;
aH = (float*) malloc(memSize);
cH = (float*) malloc(CmemSize);
for (unsigned int i = 0; i< N; i++)
{
if (i%1024 == 0)
deger += 1.0f;
aH[i] = deger;
if (i<NumberOfElements)
cH[i] = 0.0f;
}
printf("\n aH[0] = %f \n", aH[0]);
printf("\n aH[1024] = %f \n", aH[1024]);
printf("\n aH[2048] = %f \n", aH[2048]);
printf("\n aH[3072] = %f \n", aH[3072]);
const unsigned int n_threadsx = 256;
const unsigned int n_threadsy = 1;
const unsigned int n_blocks = NumberOfElements;
printf("\n number of blocks = %d \n", n_blocks);
dim3 dimBlocks (n_threadsx, n_threadsy);
dim3 dimGrid (n_blocks, 1);
printf(“\n Grid Dim :%d \t %d\t %d\n”, dimGrid.x ,dimGrid.y, dimGrid.z);
printf("\n Block Dim :%d \t %d\t %d\n", dimBlocks.x , dimBlocks.y, dimBlocks.z);
cudaMalloc((void**) &aD, memSize);
cudaMalloc((void**) &cD, CmemSize);
cudaMemcpy(aD, aH, memSize, cudaMemcpyHostToDevice);
cudaMemcpy(cD, cH, CmemSize, cudaMemcpyHostToDevice);
SumArray <<<dimGrid, dimBlocks>>> (aD, cD, N);
cudaError_t err = cudaGetLastError();
fprintf(stderr, "Cuda error: : %s.\n", cudaGetErrorString( err) );
cudaMemcpy(cH, cD, CmemSize, cudaMemcpyDeviceToHost);
fprintf(stderr, "Cuda error: : %s.\n", cudaGetErrorString( err) );
for(unsigned int i = 0; i < n_blocks; i++)
printf("\ncH[%d] = %f\n", i, cH[i]);
free(aH);
free(cH);
cudaFree(aD);
cudaFree(cD);
return 0;
}[/codebox]
As you can see, the array divided 1024 parts and each parts are calculated by the code. Avoiding of the gridDim.x * maxthread limitation
(65535 * max 512 threads) each thread has multiple adds. Everything is fine until N = 30010241024 = 314572800. If I take N as
40010241024 = 419430400, the code returns error “invalid device pointer”. I changed some variables from int to long and took all
the the elements of array as 1, nothing changed, same error.
Any suggestions, thanks…