cudaMemcopy bug

Hello,

I have a problem with simple code. I managed to find that program crashes at the cudamemcopy. I use 2D matrices. It works for size smaller than 9600x9600, and it crashes for larger sizes. If I remove the cudamemcopy line it works without problem. The size of the matrix is about 1.7 GB.

  • Operating System: RHEL 5.7
  • System description: Intel Xeon X5650 CPUs with six cores each, 24 GB RAM, NVIDIA Tesla M2070

I only get this error message from the queuing system:

    Loading CUDA version 4.0.17

    Note: Programming environment is now PrgEnv-gnu

Command terminated by signal 11
srun: error: g6: task 0: Exited with exit code 11
srun: Terminating job step 749571.0
0.00user 0.00system 0:00.01elapsed 18%CPU (0avgtext+0avgdata 7936maxresident)k
0inputs+0outputs (2major+545minor)pagefaults 0swaps
slurmd[g6]: *** STEP 749571.0 KILLED AT 2011-09-19T10:53:59 WITH SIGNAL 9 ***

This is the code I used and the header for the errors is included:

#include
#include
#include
#include “error_checks.h” // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

int main(void)
{

cudaDeviceProp  prop;

int Gcount;
CUDA_CHECK( cudaGetDeviceCount( &Gcount ) );
for (int i=0; i< Gcount; i++) {
    CUDA_CHECK( cudaGetDeviceProperties( &prop, i ) );
    printf( "   --- General Information for device %d ---\n", i );
    printf( "Name:  %s\n", prop.name );
    printf( "Compute capability:  %d.%d\n", prop.major, prop.minor );
    printf( "Clock rate:  %d\n", prop.clockRate );
    printf( "Device copy overlap:  " );
    if (prop.deviceOverlap)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n");
    printf( "Kernel execution timeout :  " );
    if (prop.kernelExecTimeoutEnabled)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n" );

    printf( "   --- Memory Information for device %d ---\n", i );
    printf( "Total global mem:  %ld\n", prop.totalGlobalMem );
    printf( "Total constant Mem:  %ld\n", prop.totalConstMem );
    printf( "Max mem pitch:  %ld\n", prop.memPitch );
    printf( "Texture Alignment:  %ld\n", prop.textureAlignment );

    printf( "   --- MP Information for device %d ---\n", i );
    printf( "Multiprocessor count:  %d\n",
                prop.multiProcessorCount );
    printf( "Shared mem per mp:  %ld\n", prop.sharedMemPerBlock );
    printf( "Registers per mp:  %d\n", prop.regsPerBlock );
    printf( "Threads in warp:  %d\n", prop.warpSize );
    printf( "Max threads per block:  %d\n",
                prop.maxThreadsPerBlock );
    printf( "Max thread dimensions:  (%d, %d, %d)\n",
                prop.maxThreadsDim[0], prop.maxThreadsDim[1],
                prop.maxThreadsDim[2] );
    printf( "Max grid dimensions:  (%d, %d, %d)\n",
                prop.maxGridSize[0], prop.maxGridSize[1],
                prop.maxGridSize[2] );
    printf( "\n" );
}

size_t free, total;
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB at the beginning\n”,free/1024,total/1024);

clock_t start, end;
double cpu_time_used;

start = clock();

const int lcellx=16;
const int ncellx=600; //,ncelly=ncellx,ncellz=ncellx;
const int lx=ncellxlcellx,ly=lx;
const int totsize=lx
ly,totsize_pad=lx2(ly/2+1),totsize_invspa=lx*(ly/2+1); const double Pi=acos(-1.0);
const double dt=0.5;
const double at=7.255197456936871;
const double qt=2acos(-1.0)/at;
const double dx=at/((double)lcellx),dy=dx
sqrt(3.0)/2.0;
const double r=-0.25,pm=-0.25;
int nend=6000;
int nsteps=2;
double kx,ky;
double ene,ppmm;

double *dqq;
cufftDoubleReal *dpsi,*dbff;
cufftHandle prc,pcr;

cufftDoubleReal hbff[totsize_pad],hpsi[totsize_pad];
double hqq[totsize_invspa];

printf(“\n”);
printf(" %d KB memory which will be allocated direct by the user on the device. lx= %d ly= %d\n",(2*totsize_pad+totsize_invspa)*sizeof(double)/1024,lx,ly);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB before the cufft plans are made .\n”,free/1024,total/1024);

cufftPlan2d(&prc,lx,ly,CUFFT_D2Z);
cufftPlan2d(&pcr,lx,ly,CUFFT_Z2D);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB after the cufft plans are made.\n”,free/1024,total/1024);

dim3 grid,threads;
threads.x=1024;
threads.y=1;
threads.z=1;
grid.x=(int)ceil((double)lx/sqrt((double)threads.x));
grid.y=(int)ceil((double)2*(ly/2+1)/sqrt((double)threads.x));
grid.z=1;

printf(“\n”);
printf(“%d %d %d %d\n”,grid.x,grid.y,threads.xgrid.xgrid.y,totsize_pad);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB before allocations.\n”,free/1024,total/1024);
int count;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{

hpsi[count]=-0.1*(cos(qt*i*dx)*cos(qt*j*dy/sqrt(3.0f))-0.5*cos(2*qt*j*dy/sqrt(3.0f)))+pm;   
ppmm=ppmm+hpsi[count];
}
else
{
hpsi[count]=0;
}
count=count+1;  
}}  
printf("%66.60lf\n", ppmm/(double)totsize);
 
count=0;
for(int i=0;i<lx;i++)
{   
if(i<=lx/2)
{kx=i*2*Pi/((double)lx*dx);}
else   
{kx=(i-lx)*2*Pi/((double)lx*dx);}   

for(int j=0;j<=ly/2;j++)
{
ky=j*2*Pi/((double)ly*dy);

hqq[count]=-(kx*kx+ky*ky);

count=count+1;
}}

CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(cufftDoubleReal)totsize_pad) );
CUDA_CHECK( cudaMalloc((void
*)&dbff, sizeof(cufftDoubleReal)totsize_pad) );
CUDA_CHECK( cudaMalloc((void
*)&dqq, sizeof(double)*totsize_invspa) );

printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB after allocations.\n”,free/1024,total/1024);

cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dqq, hqq, sizeof(double)*totsize_invspa,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(cufftDoubleReal)*totsize_pad,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();


end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%66.60lf \n",cpu_time_used);


CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dbff) );
CUDA_CHECK( cudaFree((void*)dqq) );
cufftDestroy(prc);
cufftDestroy(pcr);

printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB at the end.\n”,free/1024,total/1024);

return 0;

}
error_checks.h (854 Bytes)

It seems that I found the answer to my own question. There is some problem with large transfers. The solution was to declare the host arrays as static. This solved the problem, but I have no idea why. I assumed that a declaration double[totsize] is already static declaration.

If the declaration is inside a function, it is by default a dynamic variable that gets allocated on the stack. You probably need to increase the stack size limit with ulimit.

Hello,

Tahnk you for your reply. My arrays were being declared inside the main program and defined as h[totsize]. Until now I wrote programs only in Fortran and this declaration I thought it is static.

I am trying to learn “on the fly” both C and CUDA C. I am glad now that it works. Because I was running it in the slurm queueing system, there were no useful messages when it was crashing and it took me some time to detect the place where it was crashing. The debugger was also useless in this case.

Cristian