I am currently running CUDA 2.0b2 on a GTX 280, and I am encountering some very strange behavior. I’m calling CUDA programs from a VC++ program that allocates the memory once and then passes pointers. It works fine in debug, but produces the wrong output if I use the MT DLL instead of the MT DLL debug libraries. Second, when running with the debug libraries it works fine for a few iterations (the code is called over and over again), then the screen flickers and it stops working correctly.
In fact, afterwards if I run the code again it runs slower and does the same thing, and if I run any other program which uses the graphics card it performs very poorly.
I think I’m making some sort of horrible memory error. All the memory is allocated in the main program with cudaMalloc and then filled using the cudaMemcpy command. Then the LDOS code is called. It calls integrated, then summation, which uses the reduction code given in the SDK.
Any help about what I am doing wrong would be greatly appreciated.
// Kernel: computes, for each sample i, a Lorentzian-like spectral weight from the
// self-energy (selfer/selfei) built out of the gap functions gapm/gapm2 and the
// band energy ekm, scaled by the geometric weight distance[i], and writes it to sum[i].
//
// Launch layout: 1-D grid, 1-D blocks; one thread per array element.
// PRECONDITION: the total thread count (gridDim.x * blockDim.x) must exactly equal
// the length of the input arrays — there is no bounds guard, so any excess thread
// reads and writes out of bounds. The host wrapper `ldos` launches N/256 blocks of
// 256 threads, so N must be a multiple of 256.
//
// NOTE(review): the original post lost every '*' to forum formatting; the
// multiplications below are reconstructed from the surrounding algebra
// (squared denominators, gapma^2 numerators) — confirm against the poster's code.
__global__ void integrated(float * en, float gamma1, float gamma2, float D00, float B, float gaps,
                           float * gapm, float * gapm2, float * ekm, float * distance, float * sum) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Broadening: constant part plus an energy-dependent part.
    float gapma, gamma = gamma1 + gamma2 * en[i];

    // Inside the window |gapm*gaps| <= D00, blend the two gap functions with weight B;
    // outside, use the primary gap alone. fabsf keeps the computation in single precision.
    if (fabsf(gapm[i] * gaps) <= D00) {
        gapma = gapm[i] * gaps * B + gapm2[i] * gaps * (1.0f - B);
    } else {
        gapma = gaps * gapm[i];
    }

    // Self-energy, imaginary and real parts; both share the denominator (e+ek)^2 + gamma^2.
    float denom  = (en[i] + ekm[i]) * (en[i] + ekm[i]) + gamma * gamma;
    float selfei = -gamma - gapma * gapma * gamma / denom;
    float selfer =  gapma * gapma * (en[i] + ekm[i]) / denom;

    // Spectral weight: Im(Sigma) / ((e - ek - Re(Sigma))^2 + Im(Sigma)^2), area-weighted.
    float shifted = en[i] - ekm[i] - selfer;
    sum[i] = selfei / (shifted * shifted + selfei * selfei) * distance[i];
}
// Host wrapper: for each of the `nosts` states, performs a two-pass SDK-style
// reduction of the xsize*ysize slice of the device array `sum`, copies the final
// partials back to the host buffer `sum2`, and stores the total in reducedout[i].
//
// Parameters (all buffers pre-allocated by the caller):
//   sum        device input, nosts slices of xsize*ysize floats
//   reducedout host output, one float per state
//   tempadd    device scratch for first-pass per-block partials
//   poop       device scratch for second-pass partials
//   sum2       host staging buffer for the device->host copy
//
// BUG FIXED (review): the original called
//     cudaMemset((void **) &tempadd, 0, ...);
// which passes the *address of the host pointer variable* to cudaMemset instead
// of the device buffer it points to — corrupting host memory / failing with an
// invalid-pointer error rather than zeroing the scratch buffer. That matches the
// "works a few iterations, then garbage and the whole card slows down" symptom.
// cudaMemset takes the device pointer itself, not a pointer-to-pointer.
// Also: `sum` and `tempadd` were declared `float` (by value) while the caller
// passes `float *`; the stripped '*' declarators are restored here, and the
// byte count uses integer arithmetic instead of the double literal 2.0.
__host__ void summation(float * sum, float * reducedout, int xsize, int ysize, int nosts,
                        float * sum2, float * poop, float * tempadd) {
    int blocks, threads, blocks2, threads2;

    // SDK reduction helper: (kernel variant, n, maxBlocks, maxThreads, out blocks, out threads).
    // NOTE(review): the maxBlocks/maxThreads expressions are reconstructed as
    // xsize*0.5 / ysize*0.5 from the mangled post — confirm against the original.
    getNumBlocksAndThreads(6, xsize * ysize, xsize * 0.5, ysize * 0.5, blocks, threads);
    getNumBlocksAndThreads(6, blocks, xsize * 0.5, xsize * 0.5, blocks2, threads2);
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    for (int i = 0; i < nosts; i++) {
        // Zero the first-pass scratch buffer on the device (pointer itself, not &pointer).
        cudaMemset(tempadd, 0, 2 * xsize * sizeof(float));
        // Pass 1: reduce this state's xsize*ysize slice into per-block partials.
        reduce(ysize * xsize, threads, blocks, sum, tempadd, xsize, ysize, i);
        // Pass 2: reduce the per-block partials down to blocks2 values.
        reduce(blocks, threads2, blocks2, tempadd, poop, 0, 0, 0);
        // Blocking copy also synchronizes, so sum2[0] is valid to read on the host.
        cudaMemcpy(sum2, poop, blocks2 * sizeof(float), cudaMemcpyDeviceToHost);
        reducedout[i] = sum2[0];
    }
}
// Host entry point: zeroes the device accumulator, launches the `integrated`
// kernel over all N = xsize*ysize*nosts samples (one thread each), then reduces
// the result per state into `output` via `summation`.
//
// All pointer arguments except `output`, `outputh`, and `sum2` are device buffers
// allocated once by the caller (per the poster's description).
//
// PRECONDITION: N must be a multiple of 256 — gridsize = N/256 truncates, and the
// kernel has no bounds guard, so a non-multiple N leaves a tail of samples
// uncomputed (or, with a rounded-up grid, would read/write out of bounds).
//
// BUG FIXED (review): cudaMemset was called as cudaMemset((void **) &sum, ...),
// passing the address of the host-side pointer variable instead of the device
// buffer — the accumulator was never actually zeroed and host memory around the
// local was at risk. cudaMemset takes the device pointer itself.
__host__ void ldos(float * en, float gamma1, float gamma2, float D00, float B, float gaps,
                   float * gapm, float * gapm2, float * ekm, float * distance, int xsize,
                   int ysize, int nosts, float * output, float * sum, float * outputh,
                   float * sum2, float * tempadd, float * poop) {
    int N = xsize * ysize * nosts;        // total number of samples (one thread per sample)
    dim3 dimBlock(256);
    int gridsize = N / 256;               // exact division required — see precondition above
    dim3 dimGrid(gridsize);

    // Zero the device accumulator (pointer itself, not &pointer).
    cudaMemset(sum, 0, N * sizeof(float));

    integrated<<<dimGrid, dimBlock>>>(en, gamma1, gamma2, D00, B, gaps,
                                      gapm, gapm2, ekm, distance, sum);
    // NOTE(review): consider checking cudaGetLastError() here — launch errors are
    // silent otherwise and would explain the "works then breaks" behavior.

    summation(sum, output, xsize, ysize, nosts, sum2, poop, tempadd);
}