cudaMemcopy bug

pasoleatis · September 19, 2011, 7:59am

Hello,

I have a problem with simple code. I managed to find that program crashes at the cudamemcopy. I use 2D matrices. It works for size smaller than 9600x9600, and it crashes for larger sizes. If I remove the cudamemcopy line it works without problem. The size of the matrix is about 1.7 GB.

Operating System: RHEL 5.7
System description: Intel Xeon X5650 CPUs with six cores each, 24 GB RAM, NVIDIA Tesla M2070

I only get this error message from the queuing system:

    Loading CUDA version 4.0.17

    Note: Programming environment is now PrgEnv-gnu

Command terminated by signal 11
srun: error: g6: task 0: Exited with exit code 11
srun: Terminating job step 749571.0
0.00user 0.00system 0:00.01elapsed 18%CPU (0avgtext+0avgdata 7936maxresident)k
0inputs+0outputs (2major+545minor)pagefaults 0swaps
slurmd[g6]: *** STEP 749571.0 KILLED AT 2011-09-19T10:53:59 WITH SIGNAL 9 ***

This is the code I used and the header for the errors is included:

#include
#include
#include
#include “error_checks.h” // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

int main(void)
{

cudaDeviceProp  prop;

int Gcount;
CUDA_CHECK( cudaGetDeviceCount( &Gcount ) );
for (int i=0; i< Gcount; i++) {
    CUDA_CHECK( cudaGetDeviceProperties( &prop, i ) );
    printf( "   --- General Information for device %d ---\n", i );
    printf( "Name:  %s\n", prop.name );
    printf( "Compute capability:  %d.%d\n", prop.major, prop.minor );
    printf( "Clock rate:  %d\n", prop.clockRate );
    printf( "Device copy overlap:  " );
    if (prop.deviceOverlap)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n");
    printf( "Kernel execution timeout :  " );
    if (prop.kernelExecTimeoutEnabled)
        printf( "Enabled\n" );
    else
        printf( "Disabled\n" );

    printf( "   --- Memory Information for device %d ---\n", i );
    printf( "Total global mem:  %ld\n", prop.totalGlobalMem );
    printf( "Total constant Mem:  %ld\n", prop.totalConstMem );
    printf( "Max mem pitch:  %ld\n", prop.memPitch );
    printf( "Texture Alignment:  %ld\n", prop.textureAlignment );

    printf( "   --- MP Information for device %d ---\n", i );
    printf( "Multiprocessor count:  %d\n",
                prop.multiProcessorCount );
    printf( "Shared mem per mp:  %ld\n", prop.sharedMemPerBlock );
    printf( "Registers per mp:  %d\n", prop.regsPerBlock );
    printf( "Threads in warp:  %d\n", prop.warpSize );
    printf( "Max threads per block:  %d\n",
                prop.maxThreadsPerBlock );
    printf( "Max thread dimensions:  (%d, %d, %d)\n",
                prop.maxThreadsDim[0], prop.maxThreadsDim[1],
                prop.maxThreadsDim[2] );
    printf( "Max grid dimensions:  (%d, %d, %d)\n",
                prop.maxGridSize[0], prop.maxGridSize[1],
                prop.maxGridSize[2] );
    printf( "\n" );
}

size_t free, total;
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB at the beginning\n”,free/1024,total/1024);

clock_t start, end;
double cpu_time_used;

start = clock();

const int lcellx=16;
const int ncellx=600; //,ncelly=ncellx,ncellz=ncellx;
const int lx=ncellxlcellx,ly=lx;
const int totsize=lxly,totsize_pad=lx2(ly/2+1),totsize_invspa=lx*(ly/2+1); const double Pi=acos(-1.0);
const double dt=0.5;
const double at=7.255197456936871;
const double qt=2acos(-1.0)/at;
const double dx=at/((double)lcellx),dy=dxsqrt(3.0)/2.0;
const double r=-0.25,pm=-0.25;
int nend=6000;
int nsteps=2;
double kx,ky;
double ene,ppmm;

double *dqq;
cufftDoubleReal *dpsi,*dbff;
cufftHandle prc,pcr;

cufftDoubleReal hbff[totsize_pad],hpsi[totsize_pad];
double hqq[totsize_invspa];

printf(“\n”);
printf(" %d KB memory which will be allocated direct by the user on the device. lx= %d ly= %d\n",(2*totsize_pad+totsize_invspa)*sizeof(double)/1024,lx,ly);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB before the cufft plans are made .\n”,free/1024,total/1024);

cufftPlan2d(&prc,lx,ly,CUFFT_D2Z);
cufftPlan2d(&pcr,lx,ly,CUFFT_Z2D);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB after the cufft plans are made.\n”,free/1024,total/1024);

dim3 grid,threads;
threads.x=1024;
threads.y=1;
threads.z=1;
grid.x=(int)ceil((double)lx/sqrt((double)threads.x));
grid.y=(int)ceil((double)2*(ly/2+1)/sqrt((double)threads.x));
grid.z=1;

printf(“\n”);
printf(“%d %d %d %d\n”,grid.x,grid.y,threads.xgrid.xgrid.y,totsize_pad);
printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB before allocations.\n”,free/1024,total/1024);
int count;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{

hpsi[count]=-0.1*(cos(qt*i*dx)*cos(qt*j*dy/sqrt(3.0f))-0.5*cos(2*qt*j*dy/sqrt(3.0f)))+pm;   
ppmm=ppmm+hpsi[count];
}
else
{
hpsi[count]=0;
}
count=count+1;  
}}  
printf("%66.60lf\n", ppmm/(double)totsize);
 
count=0;
for(int i=0;i<lx;i++)
{   
if(i<=lx/2)
{kx=i*2*Pi/((double)lx*dx);}
else   
{kx=(i-lx)*2*Pi/((double)lx*dx);}   

for(int j=0;j<=ly/2;j++)
{
ky=j*2*Pi/((double)ly*dy);

hqq[count]=-(kx*kx+ky*ky);

count=count+1;
}}

CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(cufftDoubleReal)totsize_pad) );
CUDA_CHECK( cudaMalloc((void*)&dbff, sizeof(cufftDoubleReal)totsize_pad) );
CUDA_CHECK( cudaMalloc((void*)&dqq, sizeof(double)*totsize_invspa) );

printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB after allocations.\n”,free/1024,total/1024);

cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dqq, hqq, sizeof(double)*totsize_invspa,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(cufftDoubleReal)*totsize_pad,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();


end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%66.60lf \n",cpu_time_used);


CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dbff) );
CUDA_CHECK( cudaFree((void*)dqq) );
cufftDestroy(prc);
cufftDestroy(pcr);

printf(“\n”);
cudaMemGetInfo(&free,&total);
printf(“%d KB free of total %d KB at the end.\n”,free/1024,total/1024);

return 0;

}
error_checks.h (854 Bytes)

pasoleatis · September 19, 2011, 12:52pm

It seems that I found the answer to my own question. There is some problem with large transfers. The solution was to declare the host arrays as static. This solved the problem, but I have no idea why. I assumed that a declaration double[totsize] is already static declaration.

tera · September 19, 2011, 1:16pm

If the declaration is inside a function, it is by default a dynamic variable that gets allocated on the stack. You probably need to increase the stack size limit with ulimit.

pasoleatis · September 19, 2011, 4:45pm

Hello,

Tahnk you for your reply. My arrays were being declared inside the main program and defined as h[totsize]. Until now I wrote programs only in Fortran and this declaration I thought it is static.

I am trying to learn “on the fly” both C and CUDA C. I am glad now that it works. Because I was running it in the slurm queueing system, there were no useful messages when it was crashing and it took me some time to detect the place where it was crashing. The debugger was also useless in this case.

Cristian

Topic		Replies	Views
GTX Titan Black Memory Issues CUDA Programming and Performance	8	1640	December 14, 2016
cudaMalloc(long*, size) failure on Tesla C2070 device CUDA Programming and Performance	4	1429	March 15, 2012
Cuda application crashes works fine for small data and crashes for big data CUDA Programming and Performance	3	412	October 12, 2021
cudaMalloc error in big loop CUDA Programming and Performance	12	15602	May 21, 2008
matrix multiplication with its transpose in cuda(cudamemcpy from device to host not working) . CUDA Programming and Performance	6	1754	October 5, 2018
CUDA 5.0 - cudaGetDeviceProperties returns very strange grid size CUDA Programming and Performance	10	4733	June 6, 2013
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	361	December 8, 2020
Setting up CUDAMPF program problem core dumped CUDA Setup and Installation	4	721	January 14, 2019
Cudamalloc attempting to allocate more memory than it is supposed to CUDA Programming and Performance cuda	15	60	January 13, 2025
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3021	November 13, 2017

cudaMemcopy bug

Related topics