Hello all,
We’ve been trying to run this piece of code which is step one for us in a long series of steps. We’re getting an unspecified launch failure after the second kernel call. Without the cutilCheckMsg() call, input_d is filled with all zeros or zeros with garbage values. We’ve tried higher numbers for NO_BITS. Basically, NUMBLOCKS*NUMTHREADS should be equal to PERMUTATIONS. Please help.
[codebox]#include<stdio.h>
#include<stdlib.h>
#include<sys/time.h>
#include<math.h>
#include"macros.h"
#define NUMBLOCKS 144
#define NUMTHREADS 32
#define NO_BITS 9
#define PERMUTATIONS pow(2,NO_BITS)
global void init(int *input_d){
int index=blockIdx.x*blockDim.x + threadIdx.x;
input_d[index]=0;
}
global void input_set(int *input_d){
int index=blockIdx.x*blockDim.x + threadIdx.x;
int i,k;
i=index;
k=(index+1)*NO_BITS-1;
do{
input_d[k]=i%2;
i=i/2;
k--;
}while(i!=0);
}
int main(){
unsigned long long int k=0;
float timetaken;
struct timeval TimeValue_Start, TimeValue_Final;
struct timezone TimeZone_Start, TimeZone_Final;
int *input_h;
int *input_d;
dim3 dimGrid(NUMBLOCKS);
dim3 dimBlock(NUMTHREADS);
gettimeofday(&TimeValue_Start, &TimeZone_Start);
input_h=(int *)malloc(PERMUTATIONS*NO_BITS*sizeof(int));
cudaMalloc((void **)&input_d,PERMUTATIONS*NO_BITS*sizeof(int));
init<<< dimGrid,dimBlock >>>(input_d);
cudaThreadSynchronize();
input_set<<< dimGrid,dimBlock >>>(input_d);
cutilCheckMsg("Kernel execution failed");
cudaMemcpy(input_h,input_d,PERMUTATIONS*NO_BITS*sizeof(int),
cudaMemcpyDeviceToHost);
for(k=0;k<PERMUTATIONS*NO_BITS;k++){
if(k%NO_BITS==0)
printf("\n");
printf("%d",input_h[k]);
}
gettimeofday(&TimeValue_Final, &TimeZone_Final);
timetaken = (double) (TimeValue_Final.tv_sec - TimeValue_Start.tv_sec) + (double) (TimeValue_Final.tv_usec - TimeValue_Start.tv_usec)/ 1000000.0;
printf("\nTime taken : %lf \n", timetaken);
free(input_h);
cudaFree(input_d);
}[/codebox]