The failure seems to depend on how long the kernel takes to run: if I reduce the resolution or the number of iterations in the ‘iterate’ function, everything works OK. I am new to CUDA and have no idea why this would be happening…
Windows 7 64 bit, GTX770 and latest drivers.
#include <stdio.h>
#include <windows.h>
#include "helper_cuda.h"
#include "cuComplex.h"
#define MAX_RADIUS_2 4.0F
#define CCE checkCudaErrors
// Escape-time iteration of z <- z^2 + c, at most `imax` times, stopping
// early once |z|^2 exceeds MAX_RADIUS_2 (squared radius, avoiding a sqrt).
//
// Returns the number of iterations completed before escape (imax when the
// orbit never escapes). The original version returned void and discarded
// the computed value, which made the entire loop dead code with no
// observable effect; it also never used MAX_RADIUS_2 at all, so every
// thread always ran the full `imax` iterations. Returning int is
// backward-compatible: existing callers that ignore the result still work.
__device__ int iterate( cuFloatComplex z,
                        cuFloatComplex c,
                        int imax )
{
    for ( int i = 0; i < imax; ++ i ) {
        // |z|^2 = re^2 + im^2; compare against the squared escape radius.
        if ( z.x * z.x + z.y * z.y > MAX_RADIUS_2 )
            return i;
        // z = z^2 + c
        z = cuCaddf( cuCmulf( z, z ), c );
    }
    return imax;
}
// Per-pixel escape-time kernel.
//
// Expected launch: 2-D grid of 2-D blocks covering the image, one thread
// per pixel (Julia() launches 16x16 blocks). The image resolution is
// recovered from the launch geometry (gridDim * blockDim), so the kernel
// needs no parameters.
//
// Defect in the original: threadIdx/blockIdx were never read, so all
// threads iterated the same hard-coded point (0,0) and discarded the
// result — millions of identical, useless iterations.
//
// NOTE(review): the iteration count is still discarded because the kernel
// has no output buffer; the next step is to add a device pointer argument
// and store the per-pixel result.
__global__ void julia( )
{
    // This thread's pixel coordinates.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    // Total resolution implied by the launch configuration.
    int rx = gridDim.x * blockDim.x;
    int ry = gridDim.y * blockDim.y;

    // Map the pixel into the complex plane [-2, 2] x [-2, 2].
    cuFloatComplex z;
    cuFloatComplex c;
    z.x = 0.0F;
    z.y = 0.0F;
    c.x = -2.0F + 4.0F * (float) px / (float) rx;
    c.y = -2.0F + 4.0F * (float) py / (float) ry;
    iterate( z, c, 100 );
}
// Launch the julia kernel over a res_x x res_y pixel grid and wait for it.
//
// Fixes vs. the original:
//  - grid dimensions use ceiling division, so resolutions that are not a
//    multiple of 16 are still fully covered (the original truncated, and
//    any resolution below 16 produced a zero-sized grid);
//  - the launch itself is checked with cudaGetLastError(): kernel launches
//    return no status directly, so configuration errors were silent.
void Julia( size_t res_x, size_t res_y )
{
    printf( "Julia Start\n" ); fflush( stdout );
    dim3 threads( 16, 16 );
    dim3 grid( (unsigned int) ( ( res_x + threads.x - 1 ) / threads.x ),
               (unsigned int) ( ( res_y + threads.y - 1 ) / threads.y ) );
    julia<<< grid, threads >>>( );
    CCE( cudaGetLastError( ) );       // catches bad launch configuration
    // NOTE(review): on Windows/WDDM a display-attached GPU enforces a TDR
    // watchdog (~2 s by default); a long-running kernel is killed by the
    // driver and the error surfaces here at the synchronize. That matches
    // the symptom described at the top of this file (works when the
    // resolution or iteration count is reduced) — confirm by inspecting
    // the error code reported below.
    CCE( cudaDeviceSynchronize( ) );  // surfaces async execution errors
    printf( "Julia Stop\n" ); fflush( stdout );
}
// Select the first CUDA device whose major compute capability is >= 1 and
// make it the current device. Exits the process (with a diagnostic on
// stderr — the original exited silently) when no usable device is found.
// Returns 0 on success, kept for source compatibility with callers.
int CudaInit( )
{
    int count = 0;
    // Left unchecked deliberately: on failure `count` stays 0 and the
    // branch below produces the friendlier diagnostic.
    cudaGetDeviceCount( & count );
    if ( count == 0 ) {
        fprintf( stderr, "CudaInit: no CUDA devices found\n" );
        exit( -1 );
    }
    int dev;
    for ( dev = 0; dev < count; ++ dev ) {
        cudaDeviceProp prop;
        // NOTE(review): `prop.major >= 1` is trivially true for any real
        // device; kept so device-selection behavior is unchanged. Raise
        // the threshold here if a minimum compute capability is required.
        if ( cudaGetDeviceProperties( & prop, dev ) == cudaSuccess &&
             prop.major >= 1 )
            break;
    }
    if ( dev == count ) {
        fprintf( stderr, "CudaInit: no usable CUDA device\n" );
        exit( -1 );
    }
    CCE( cudaSetDevice( dev ) );  // was unchecked in the original
    return 0;
}
// Image resolution: RX x RY threads are launched, one per pixel.
#define RX 4096
#define RY 4096

// Initialize a device, run one Julia render, and reset the device.
//
// NOTE(review): with RX = RY = 4096 the launch runs ~16.7M threads; on
// Windows 7 with a display-attached GPU the WDDM TDR watchdog terminates
// any kernel running longer than roughly 2 seconds, which would explain
// the failure described at the top of this file (and why reducing the
// resolution or iteration count makes it work). Confirm via the error
// code reported by cudaDeviceSynchronize; workarounds are raising the
// TdrDelay registry setting or splitting the work into smaller launches.
int main( int argc, const char ** argv )
{
    (void) argc;  // command-line arguments are intentionally unused
    (void) argv;
    CudaInit( );
    Julia( RX, RY );
    CCE( cudaDeviceReset( ) );
    return 0;  // was implicit; make the success status explicit
}