Hello,
I have a problem with a simple code. I managed to narrow it down: the program crashes at the cudaMemcpy call. I use 2D matrices. It works for sizes smaller than 9600x9600 and crashes for larger sizes. If I remove the cudaMemcpy line, it runs without problems. The total size of the matrices is about 1.7 GB.
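For reference, this is how the 1.7 GB figure comes about. Below is a minimal standalone size check, assuming the same constants as in the code further down and the in-place D2Z padding of the real array to lx*2*(ly/2+1) elements, as in that code:
/* Minimal size check, assuming lcellx=16 and ncellx=600 as in the code below. */
#include <stdio.h>
int main(void)
{
const long lcellx=16, ncellx=600;
const long lx=ncellx*lcellx, ly=lx; /* 9600 x 9600 grid */
const long totsize_pad=lx*2*(ly/2+1); /* padded real array used for the D2Z transform */
const long totsize_invspa=lx*(ly/2+1); /* half-spectrum array holding the k values */
/* two padded real arrays plus one k-space array, all in double precision */
const long bytes=(2*totsize_pad+totsize_invspa)*(long)sizeof(double);
printf("%ld bytes, about %.2f GB\n", bytes, bytes/(1024.0*1024.0*1024.0));
return 0;
}
This prints roughly 1.7 GB: that is the total for the three device arrays (dpsi, dbff, dqq), and the host arrays hpsi, hbff, and hqq add up to the same amount again.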
- Operating System: RHEL 5.7
- System description: Intel Xeon X5650 CPUs with six cores each, 24 GB RAM, NVIDIA Tesla M2070
I only get this error message from the queuing system:
Loading CUDA version 4.0.17
Note: Programming environment is now PrgEnv-gnu
Command terminated by signal 11
srun: error: g6: task 0: Exited with exit code 11
srun: Terminating job step 749571.0
0.00user 0.00system 0:00.01elapsed 18%CPU (0avgtext+0avgdata 7936maxresident)k
0inputs+0outputs (2major+545minor)pagefaults 0swaps
slurmd[g6]: *** STEP 749571.0 KILLED AT 2011-09-19T10:53:59 WITH SIGNAL 9 ***
This is the code I used; the header with the error-checking macros (error_checks.h) is attached at the end:
#include <stdio.h>
#include <math.h>
#include <time.h>
#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
int main(void)
{
cudaDeviceProp prop;
int Gcount;
CUDA_CHECK( cudaGetDeviceCount( &Gcount ) );
for (int i=0; i< Gcount; i++) {
CUDA_CHECK( cudaGetDeviceProperties( &prop, i ) );
printf( " --- General Information for device %d ---\n", i );
printf( "Name: %s\n", prop.name );
printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
printf( "Clock rate: %d\n", prop.clockRate );
printf( "Device copy overlap: " );
if (prop.deviceOverlap)
printf( "Enabled\n" );
else
printf( "Disabled\n");
printf( "Kernel execution timeout : " );
if (prop.kernelExecTimeoutEnabled)
printf( "Enabled\n" );
else
printf( "Disabled\n" );
printf( " --- Memory Information for device %d ---\n", i );
printf( "Total global mem: %ld\n", prop.totalGlobalMem );
printf( "Total constant Mem: %ld\n", prop.totalConstMem );
printf( "Max mem pitch: %ld\n", prop.memPitch );
printf( "Texture Alignment: %ld\n", prop.textureAlignment );
printf( " --- MP Information for device %d ---\n", i );
printf( "Multiprocessor count: %d\n",
prop.multiProcessorCount );
printf( "Shared mem per mp: %ld\n", prop.sharedMemPerBlock );
printf( "Registers per mp: %d\n", prop.regsPerBlock );
printf( "Threads in warp: %d\n", prop.warpSize );
printf( "Max threads per block: %d\n",
prop.maxThreadsPerBlock );
printf( "Max thread dimensions: (%d, %d, %d)\n",
prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2] );
printf( "Max grid dimensions: (%d, %d, %d)\n",
prop.maxGridSize[0], prop.maxGridSize[1],
prop.maxGridSize[2] );
printf( "\n" );
}
size_t free, total;
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB at the beginning\n",free/1024,total/1024);
clock_t start, end;
double cpu_time_used;
start = clock();
const int lcellx=16;
const int ncellx=600; //,ncelly=ncellx,ncellz=ncellx;
const int lx=ncellx*lcellx,ly=lx;
const int totsize=lx*ly,totsize_pad=lx*2*(ly/2+1),totsize_invspa=lx*(ly/2+1);
const double Pi=acos(-1.0);
const double dt=0.5;
const double at=7.255197456936871;
const double qt=2*acos(-1.0)/at;
const double dx=at/((double)lcellx),dy=dx*sqrt(3.0)/2.0;
const double r=-0.25,pm=-0.25;
int nend=6000;
int nsteps=2;
double kx,ky;
double ene,ppmm;
double *dqq;
cufftDoubleReal *dpsi,*dbff;
cufftHandle prc,pcr;
cufftDoubleReal hbff[totsize_pad],hpsi[totsize_pad];
double hqq[totsize_invspa];
printf("\n");
printf(" %zu KB of memory will be allocated directly by the user on the device. lx= %d ly= %d\n",(2*totsize_pad+totsize_invspa)*sizeof(double)/1024,lx,ly);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB before the cufft plans are made.\n",free/1024,total/1024);
cufftPlan2d(&prc,lx,ly,CUFFT_D2Z);
cufftPlan2d(&pcr,lx,ly,CUFFT_Z2D);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB after the cufft plans are made.\n",free/1024,total/1024);
dim3 grid,threads;
threads.x=1024;
threads.y=1;
threads.z=1;
grid.x=(int)ceil((double)lx/sqrt((double)threads.x));
grid.y=(int)ceil((double)2*(ly/2+1)/sqrt((double)threads.x));
grid.z=1;
printf("\n");
printf("%d %d %d %d\n",grid.x,grid.y,threads.x*grid.x*grid.y,totsize_pad);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB before allocations.\n",free/1024,total/1024);
int count;
count=0;
ppmm=0;
for(int i=0;i<lx;i++)
{
for(int j=0;j<2*(ly/2+1);j++)
{
if(j<ly)
{
hpsi[count]=-0.1*(cos(qt*i*dx)*cos(qt*j*dy/sqrt(3.0f))-0.5*cos(2*qt*j*dy/sqrt(3.0f)))+pm;
ppmm=ppmm+hpsi[count];
}
else
{
hpsi[count]=0;
}
count=count+1;
}}
printf("%66.60lf\n", ppmm/(double)totsize);
count=0;
for(int i=0;i<lx;i++)
{
if(i<=lx/2)
{kx=i*2*Pi/((double)lx*dx);}
else
{kx=(i-lx)*2*Pi/((double)lx*dx);}
for(int j=0;j<=ly/2;j++)
{
ky=j*2*Pi/((double)ly*dy);
hqq[count]=-(kx*kx+ky*ky);
count=count+1;
}}
CUDA_CHECK( cudaMalloc((void**)&dpsi, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dbff, sizeof(cufftDoubleReal)*totsize_pad) );
CUDA_CHECK( cudaMalloc((void**)&dqq, sizeof(double)*totsize_invspa) );
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB after allocations.\n",free/1024,total/1024);
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dqq, hqq, sizeof(double)*totsize_invspa,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();
CUDA_CHECK( cudaMemcpy(dpsi, hpsi, sizeof(cufftDoubleReal)*totsize_pad,cudaMemcpyHostToDevice) );
cudaThreadSynchronize();
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%66.60lf \n",cpu_time_used);
CUDA_CHECK( cudaFree((void*)dpsi) );
CUDA_CHECK( cudaFree((void*)dbff) );
CUDA_CHECK( cudaFree((void*)dqq) );
cufftDestroy(prc);
cufftDestroy(pcr);
printf("\n");
cudaMemGetInfo(&free,&total);
printf("%zu KB free of total %zu KB at the end.\n",free/1024,total/1024);
return 0;
}
error_checks.h (854 Bytes)