Hi, I’m currently experiencing a strange behavior on my new geforce 540m:
I have written a very simple kernel to add vectors. When i put in pretty large vectors, it crashes, depending on the blocksize, e.g.:
BS: 256 Works with ~1.5 million floating point values as input, but crashes at 2 million
BS: 512 Works with ~16 million values, but crashes at 32 million
BS: 1024 Works until my main application says “bad_alloc” …probably no cuda problem External Image
Does anyone know about this problem or has a solution?
#include "cuda_add.h"
__global__ void addiere(float *a, float *b, float *c, unsigned long count)
{
unsigned long idx=blockIdx.x * blockDim.x + threadIdx.x;
if (idx<count)
{
c[idx]=a[idx]+b[idx];
}
}
void cu_addiere(float *pfA, float *pfB, float *pfC, unsigned long ulCount, struct etime *sTime)
{
cudaEvent_t start_function, start_kernel, stop_kernel, stop_function;
cudaEventCreate(&start_function);
cudaEventCreate(&start_kernel);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_function);
cudaEventRecord(start_function);
float *d_pfA=NULL;
float *d_pfB=NULL;
float *d_pfC=NULL;
cudaMalloc((void**)&d_pfA, sizeof(float)*ulCount);
cudaMalloc((void**)&d_pfB, sizeof(float)*ulCount);
cudaMalloc((void**)&d_pfC, sizeof(float)*ulCount);
cudaMemcpy(d_pfA, pfA, sizeof(float)*ulCount, cudaMemcpyHostToDevice);
cudaMemcpy(d_pfB, pfB, sizeof(float)*ulCount, cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cudaEventSynchronize(start_kernel);
addiere<<<ulCount/1024+1, 1024>>>(d_pfA,d_pfB,d_pfC,ulCount);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(pfC, d_pfC, sizeof(float)*ulCount, cudaMemcpyDeviceToHost);
cudaEventRecord(stop_function);
cudaEventSynchronize(stop_function);
cudaEventElapsedTime(&(sTime->pre_kernel), start_function, start_kernel);
cudaEventElapsedTime(&(sTime->kernel), start_kernel, stop_kernel);
cudaEventElapsedTime(&(sTime->post_kernel), stop_kernel, stop_function);
}