Hello,
I have a complicated code that is sometimes not running and causes display driver to stop. I simplifed it and try to find out problems. The simplifed version behaves same and still not consistent working. I think problem is about path divergence between threads. Can you help me working on this code?
[codebox]
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include “cutil_inline.h”
device unsigned int random(unsigned int a, unsigned int i, unsigned int range)
{
a = (a ^ 61) ^ (a >> 16);
a = a + (a << 3);
a = a + i;
a = a ^ (a >> 4);
a = a + i;
a = a * 0x27d4eb2d;
a = a ^ (a >> 15);
return ((a+i) % range);
}
global void
testKernel( int* g_idata, int* g_isSourceAvailable, int* g_isDemandAvailable, int* g_odata)
{
unsigned int demandCondition = 7;
unsigned int sourceId = 0;
unsigned int demandId = blockIdx.x * 5 + (threadIdx.x % 5);
// one thread per demand
if (threadIdx.x < 5)
{
demandId = blockIdx.x * 5 + (threadIdx.x % 5);
//demand condition
if (g_odata[demandId]<demandCondition)
{
there:
//If my demand element is available, lock it
if(atomicExch(&g_isDemandAvailable[demandId], 0))
{
//randomly select a source
sourceId = blockIdx.x * 3 + random(clock(), threadIdx.x * blockIdx.x + threadIdx.x, 3);
//If my source element is available, lock it
if(atomicExch(&g_isSourceAvailable[sourceId], 0))
{
// meet requirement
g_odata[demandId] = g_odata[demandId] + 1;
// consume source
g_idata[sourceId] = g_idata[sourceId] - 1;
// make source available, unlock
atomicExch(&g_isSourceAvailable[sourceId], 1);
}
// make demand available, unlock
atomicExch(&g_isDemandAvailable[demandId], 1);
}
__syncthreads();
// recontrol demand condition
if (g_odata[demandId]<demandCondition)
{
goto there;
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
cudaSetDevice( cutGetMaxGflopsDeviceId() );
int num_threads = 64;
int num_blocks = 4;
int mem_size = sizeof(int) * num_blocks;
int* h_isDemandAvailable = (int*) malloc(5 * mem_size);
int* h_isSourceAvailable = (int*) malloc(3 * mem_size);
// allocate host memory
int* h_idata = (int*) malloc(3 * mem_size);
// initalize the memory
for(int i = 0; i < num_blocks * 3; i++)
{
h_idata[i] = 100;
}
for(int i = 0; i < num_blocks * 3 ; i++)
{
h_isSourceAvailable[i] = 1;
}
for(int i = 0; i < num_blocks * 5 ; i++)
{
h_isDemandAvailable[i] = 1;
}
// allocate device memory
int* d_idata;
cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size * 3));
int* d_isSourceAvailable;
cutilSafeCall( cudaMalloc( (void**) &d_isSourceAvailable, mem_size*3));
int* d_isDemandAvailable;
cutilSafeCall( cudaMalloc( (void**) &d_isDemandAvailable, mem_size*5));
// copy host memory to device
cutilSafeCall( cudaMemcpy( d_idata, h_idata, mem_size * 3,
cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy( d_isSourceAvailable, h_isSourceAvailable, mem_size * 3,
cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy( d_isDemandAvailable, h_isDemandAvailable, mem_size * 5,
cudaMemcpyHostToDevice) );
// allocate device memory for result
int* d_odata;
cutilSafeCall( cudaMalloc( (void**) &d_odata, mem_size*5));
// setup execution parameters
dim3 grid( num_blocks, 1, 1);
dim3 threads( num_threads, 1, 1);
// execute the kernel
testKernel<<< grid, threads, mem_size >>>( d_idata, d_isSourceAvailable, d_isDemandAvailable, d_odata);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// allocate mem for the result on host side
int* h_odata = (int*) malloc( mem_size*5);
// copy result from device to host
cutilSafeCall( cudaMemcpy( h_odata, d_odata, mem_size*5,
cudaMemcpyDeviceToHost) );
for( int i = 0; i < num_blocks * 5; ++i)
{
printf( "%d \n", h_odata[i]);
}
// cleanup memory
free( h_idata);
free( h_isSourceAvailable);
free( h_isDemandAvailable);
free( h_odata);
cutilSafeCall(cudaFree(d_idata));
cutilSafeCall(cudaFree(d_isSourceAvailable));
cutilSafeCall(cudaFree(d_isDemandAvailable));
cutilSafeCall(cudaFree(d_odata));
cudaThreadExit();
cutilExit(argc, argv);
}
[/codebox]
My code is on Windows Vista OS and G105M compute capability 1.1 card. I cannot debug this kernel because Nsight doesnt support G105M GPU. Anyone who has fermi card can try this code on their devices? Maybe problem can be solved by compute capability 2.0?
Thanks