Summation of Big Array Parts Questions...

Hi everyone;

I wrote some code to calculate the sums of the parts of an array. The code works, but there seems to be a limitation that I was not aware of.

First, the code is as follows:

[codebox]#include <stdio.h>

#include <time.h>

#include <cuda.h>

#ifdef DEVICE_EMULATION

#define EMUSYNC __syncthreads()

#else

#define EMUSYNC

#endif

/*
 * SumArray: computes one partial sum per thread block.
 *
 * The input is split into gridDim.x consecutive parts of n / gridDim.x
 * elements each; block b adds up part b and stores the result in odata[b].
 * When n is not a multiple of gridDim.x, the last block also sums the
 * leftover tail so that no input element is silently dropped.
 *
 * Launch layout : 1-D grid and 1-D block (only blockIdx.x / threadIdx.x are
 *                 used). blockDim.x may be any value up to 1024 and does not
 *                 have to be a power of two.
 * Shared memory : 1024 floats, statically allocated.
 *
 * data  : device pointer to the n input values
 * odata : device pointer to at least gridDim.x floats (one result per block)
 * n     : number of input elements
 */
__global__ void SumArray(float *data, float *odata, unsigned int n)
{
    // One slot per thread. The array is sized for a 1024-thread block (the
    // largest block size current CUDA devices accept), so every legal
    // blockDim.x stays in bounds.
    enum { kMaxThreads = 1024 };
    __shared__ float sdata[kMaxThreads];

    const unsigned int tid       = threadIdx.x;
    const unsigned int blockSize = blockDim.x;

    // 64-bit indices: "i += blockSize" must not wrap around when n is close
    // to the top of the 32-bit range.
    const unsigned long long partSize = n / gridDim.x;
    const unsigned long long begin    = (unsigned long long)blockIdx.x * partSize;
    const unsigned long long end      = (blockIdx.x == gridDim.x - 1)
                                            ? (unsigned long long)n   // last block takes the remainder
                                            : begin + partSize;

    // Each thread walks its block's part with a stride of blockDim.x, so
    // neighbouring threads read neighbouring addresses.
    float sum = 0.0f;
    for (unsigned long long i = begin + tid; i < end; i += blockSize)
        sum += data[i];

    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory. Every step is followed by a block-wide
    // barrier; nothing relies on threads of a warp running in lockstep.
    // "half" is rounded up so that an odd number of live slots keeps its
    // middle element for the next step.
    for (unsigned int active = blockSize; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid < active / 2)
            sdata[tid] += sdata[tid + half];
        __syncthreads();   // reached by all threads: "active" is uniform across the block
        active = half;
    }

    // write result for this block to global mem
    if (tid == 0)
        odata[blockIdx.x] = sdata[0];
}

/* Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "Cuda error: %s: %s.\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills a host array of N floats (the value grows by 1 every 1024 elements),
 * copies it to the device, launches SumArray with one block per output
 * element and prints the per-block sums.
 *
 * Every allocation and every CUDA call is checked. A failed cudaMalloc
 * (for example because N does not fit into the card's global memory) is
 * reported where it happens instead of surfacing later as an
 * "invalid device pointer" from cudaMemcpy.
 *
 * Returns 0 on success and 1 if any allocation or CUDA call failed.
 */
int main()
{
    const unsigned int N = 100*1024*1024;
    const unsigned int NumberOfElements = 1024;

    // Byte counts are kept in size_t so that they cannot wrap for large N.
    const size_t memSize  = sizeof(float) * (size_t)N;
    const size_t CmemSize = sizeof(float) * (size_t)NumberOfElements;

    float *aD = NULL, *cD = NULL;
    float *aH = (float*) malloc(memSize);
    float *cH = (float*) malloc(CmemSize);

    if (aH == NULL || cH == NULL)
    {
        fprintf(stderr, "Host allocation failed.\n");
        free(aH);
        free(cH);
        return 1;
    }

    // aH[0..1023] = 1, aH[1024..2047] = 2, ...
    float deger = 0;
    for (unsigned int i = 0; i < N; i++)
    {
        if (i%1024 == 0)
            deger += 1.0f;
        aH[i] = deger;
    }

    for (unsigned int i = 0; i < NumberOfElements; i++)
        cH[i] = 0.0f;

    printf("\n  aH[0] = %f \n", aH[0]);
    printf("\n  aH[1024] = %f \n", aH[1024]);
    printf("\n  aH[2048] = %f \n", aH[2048]);
    printf("\n  aH[3072] = %f \n", aH[3072]);

    const unsigned int n_threadsx = 256;
    const unsigned int n_threadsy = 1;
    const unsigned int n_blocks   = NumberOfElements;

    printf("\n number of blocks = %u \n", n_blocks);

    dim3 dimBlocks (n_threadsx, n_threadsy);
    dim3 dimGrid (n_blocks, 1);

    printf("\n Grid Dim :%u \t %u\t %u\n", dimGrid.x, dimGrid.y, dimGrid.z);
    printf("\n Block Dim :%u \t %u\t %u\n", dimBlocks.x, dimBlocks.y, dimBlocks.z);

    // The && chain stops at the first failing call, so a failed allocation
    // is never followed by a copy into an invalid pointer.
    bool ok = cudaOk(cudaMalloc((void**) &aD, memSize), "cudaMalloc(aD)")
           && cudaOk(cudaMalloc((void**) &cD, CmemSize), "cudaMalloc(cD)")
           && cudaOk(cudaMemcpy(aD, aH, memSize, cudaMemcpyHostToDevice), "cudaMemcpy(aD <- aH)")
           && cudaOk(cudaMemcpy(cD, cH, CmemSize, cudaMemcpyHostToDevice), "cudaMemcpy(cD <- cH)");

    if (ok)
    {
        SumArray <<<dimGrid, dimBlocks>>> (aD, cD, N);

        // cudaGetLastError reports launch-configuration errors; the blocking
        // copy that follows waits for the kernel and reports execution errors.
        ok = cudaOk(cudaGetLastError(), "SumArray launch")
          && cudaOk(cudaMemcpy(cH, cD, CmemSize, cudaMemcpyDeviceToHost), "cudaMemcpy(cH <- cD)");
    }

    if (ok)
    {
        for (unsigned int i = 0; i < n_blocks; i++)
            printf("\ncH[%u] = %f\n", i, cH[i]);
    }

    free(aH);
    free(cH);

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(aD);
    cudaFree(cD);

    return ok ? 0 : 1;
}[/codebox]

As you can see, the array is divided into 1024 parts and the sum of each part is calculated by the code. To avoid the gridDim.x * max-threads limitation

(65535 blocks * at most 512 threads), each thread performs multiple additions. Everything is fine up to N = 300 * 1024 * 1024 = 314572800. If I set N to

400 * 1024 * 1024 = 419430400, the code returns the error “invalid device pointer”. I changed some variables from int to long and set all

the elements of the array to 1, but nothing changed: I get the same error.

Any suggestions, thanks…

Sorry for the silly question. I should have checked the amount of global memory on my card. I made a mistake regarding
the allocation limit. So there is no problem with the code; the array simply exceeded the card's global memory.

Thanks for reading…