Hi guys, I’m relatively new to this CUDA computing.
I tried to run my kernel with certain number of blocks(159) and threads(768).
I get the expected results.
But when i increase the thread count by 1(i.e 769), the whole thing goes berserk.
And also when i increase the block count by 1(i.e. 160), it throws a UNSPECIFIED LAUNCH FAILURE ERROR CODE 4.
I’ve posted my code below.
Please help me out.
//100000 rounds i.e 1000*100 = 1700 ms
//max threads=768 ................if it exceeds, cnt value is reset to 0;
//max blocks=159..................if it exceeds, unspecified launch failure error code 4;
//Implemented using atomicAdd on global memory
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"
#include "curand.h"
#include <curand_kernel.h>
#define ul unsigned int
#define ull unsigned long long
#define BIASMAXLIMIT 0.51
#define BIASMINLIMIT 0.49
__device__ double cudaRand(int id)
{
curandState state;
curand_init((unsigned long long)clock(), id, 0, &state);
double d_out = curand_uniform_double(&state);
return d_out;
}
__device__ void initializeR(ul *x){
int l = threadIdx.x + blockIdx.x * blockDim.x;
#define myrand32 ((ul) (4294967296.0*((double)cudaRand(l))))
int i;
for (i = 0; i < 16; i++)
x[i] = myrand32;
x[0] = 0x61707865;
x[5] = 0x3320646e;
x[10] = 0x79622d32;
x[15] = 0x6b206574;
}
__device__ void copystate(ul *x1, ul *x){
int i;
for (i = 0; i < 16; i++)
x1[i] = x[i];
}
__device__ void print(ul *x){
int i;
for (i = 0; i < 16; i++){
printf("%8x ", x[i]);
if (i > 0 && i%4 == 3)
printf("\n");
}
printf("\n");
}
__device__ void qr(ul *x0, ul *x1, ul *x2, ul *x3){
#define rotateleft(x, n) (((x) << (n)) ^((x) >> (32-n)))
#define update(a, b, c, n) ((a)^(rotateleft(((b)+(c)),(n))))
ul z0, z1, z2, z3;
z1 = update(*x1, *x3, *x0, 7);
z2 = update(*x2, *x0, z1, 9);
z3 = update(*x3, z1, z2, 13);
z0 = update(*x0, z2, z3, 18);
*x0 = z0; *x1 = z1, *x2 = z2, *x3 = z3;
}
__device__ void transpose(ul *x){
ul temp;
temp=x[1]; x[1]=x[4]; x[4]=temp;
temp=x[2]; x[2]=x[8]; x[8]=temp;
temp=x[3]; x[3]=x[12]; x[12]=temp;
temp=x[6]; x[6]=x[9]; x[9]=temp;
temp=x[7]; x[7]=x[13]; x[13]=temp;
temp=x[11]; x[11]=x[14]; x[14]=temp;
}
__device__ void rounds(ul *x){
qr(&(x[0]), &(x[4]), &(x[8]),&(x[12]));
qr(&(x[5]), &(x[9]), &(x[13]),&(x[1]));
qr(&(x[10]), &(x[14]), &(x[2]),&(x[6]));
qr(&(x[15]), &(x[3]), &(x[7]),&(x[11]));
transpose(x);
}
__global__ void gen (unsigned int *d_cnt)
{
ul x[16], x1[16], pattern;
initializeR(x);
ull pt = 0x80000000;
copystate(x1, x);
x1[7] = x[7] ^ pt;
for (int i = 0; i < 4; i++)
{ rounds(x); rounds(x1); }
for (int k = 0; k < 16; k++)
{
pattern = 0x80000000;
for (int j = 31; j >= 0; j--)
{
if (((x[k] ^ x1[k]) & pattern) == 0)
{atomicAdd(&d_cnt[k*32+j],1);}
pattern = pattern >> 1;
}
}
}
int main()
{
int k, kmin, jmin, kmax=0, jmax=0;
double val, max, min;
FILE *fp;
// Allocate host memory for matrix cnt
unsigned int size_A = 16 * 32;
unsigned int mem_size_A = sizeof(unsigned int) * size_A;
unsigned int *h_cnt = (unsigned int *)malloc(mem_size_A);
for (int i = 0; i < size_A; ++i)
{
h_cnt[i] = 0;
}
// Allocate device memory
unsigned int *d_cnt;
cudaError_t error;
error = cudaMalloc((void **) &d_cnt, mem_size_A);
if (error != cudaSuccess)
{
printf("cudaMalloc d_cnt returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
// copy host memory to device
error = cudaMemcpy(d_cnt, h_cnt, mem_size_A, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
// Setup execution parameters
dim3 dimGrid(159,1,1);
dim3 dimBlock(768,1,1);
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
// Execute the kernel
int x=0;
long starttime=clock();
while(x<64)
{
gen<<<dimGrid,dimBlock>>>(d_cnt);
error = cudaMemcpy(h_cnt, d_cnt, mem_size_A, cudaMemcpyDeviceToHost);
if (error != cudaSuccess)
{
printf("cudaMemcpy (h_cnt,d_cnt) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
for (int p = 0; p < 16; p++)
{
for (int q = 0; q < 32; q++)
{
printf(" %u",h_cnt[p*32+q]);
}
printf("\n");
}
fp = fopen("ask1.dat", "w");
fprintf(fp, "Itr %d\n",x+1);
printf("Itr %d\n",x+1);
max = min = 0.5;
x++;
for (k = 0; k < 16; k++)
{
fprintf(fp, "For index %d\n ", k);
for (int j = 0; j < 32; j++)
{
val = (double)h_cnt[k*32+j]/((double)x*dimBlock.x*dimBlock.y*dimGrid.x);
if (val > max) {max = val; kmax = k;jmax = j;}
if (val < min) {min = val; kmin = k;jmin = j;}
fprintf(fp, "(%2d %lf)", j, val);
if (val >= BIASMAXLIMIT || val <=BIASMINLIMIT)
fprintf(fp, "* ");
else
fprintf(fp, " ");
if (j > 0 && j%8 == 7)
fprintf(fp, "\n");
}
fprintf(fp, "-------------\n");
}
fprintf(fp, "%d %d %.20lf %d %d%.20lf\n", kmin, jmin, min, kmax, jmax, max);
fclose(fp);
printf("%d %d %.20lf %d %d %.20lf\n",kmin, jmin, min, kmax, jmax, max);
long finishtime=clock();
printf("%ld\n", finishtime-starttime );
starttime=clock();
}
// Copy result from device to host
// Clean up memory
free(h_cnt);
cudaFree(d_cnt);
}