Hi,
I am using CUDA under Windows XP. According to the following CUDA release notes:
I use a GeForce 6800 GT as the primary graphics card for display and an 8800 GTX for computation. The Windows display driver version 97.73 for CUDA Toolkit version 0.8 is installed for both of them.
Here is the host code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include <CUDA_test_kernel.cu>
#define BLOCKNUM 16
#define THREADNUM 32
void runTest( int argc, char** argv);
/* Program entry point: delegate all work to runTest(), then hand
 * control to the cutil CUT_EXIT macro for shutdown. */
int
main( int argc, char** argv)
{
    runTest(argc, argv);
    CUT_EXIT(argc, argv);
    return 0;
}
/*
 * Allocates a device buffer, times one launch of testKernel over
 * BLOCKNUM x THREADNUM threads, copies the per-thread results back to
 * the host, and writes them to the file "output" as "index, value" lines.
 *
 * Fixes vs. the original:
 *  - cudaThreadSynchronize() is called before the timer is stopped.
 *    Kernel launches are asynchronous, so without an explicit sync the
 *    timer measures only launch overhead, not kernel execution time.
 *  - fopen() and malloc() results are checked before use.
 */
void
runTest( int argc, char** argv)
{
    FILE* output = fopen("output", "w");
    if (output == NULL)
    {
        fprintf(stderr, "Failed to open result file \"output\".\n");
        return;
    }

    /* One int result per thread. */
    int memsize_byte = sizeof(int) * BLOCKNUM * THREADNUM;
    int *d_output;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, memsize_byte ) );

    dim3 grid(BLOCKNUM, 1, 1);
    dim3 threads(THREADNUM, 1, 1);

    /* Set up and start the cutil timer. */
    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    printf("Begin testing...\n");
    testKernel<<<grid, threads>>>(d_output);
    CUT_CHECK_ERROR("Kernel execution failed");

    /* The launch above returns immediately; block here so the timer
     * below reflects the kernel's actual execution time. */
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    printf("Computation completed.\n");

    /* Stop the timer, report, and release it. */
    CUT_SAFE_CALL( cutStopTimer( timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    /* Allocate host memory for the result and copy it back. */
    int* results = (int*) malloc(memsize_byte);
    if (results == NULL)
    {
        fprintf(stderr, "Host allocation of %d bytes failed.\n", memsize_byte);
        fclose(output);
        CUDA_SAFE_CALL(cudaFree(d_output));
        return;
    }
    CUDA_SAFE_CALL(cudaMemcpy(results, d_output, memsize_byte, cudaMemcpyDeviceToHost) );

    /* Dump "index, value" per thread. */
    for (int i = 0; i < BLOCKNUM * THREADNUM; ++i)
    {
        fprintf(output, "%d, %d\n", i, results[i]);
    }

    fclose(output);
    free(results);
    CUDA_SAFE_CALL(cudaFree(d_output));
}
And the kernel:
/*
 * Busy-work kernel: each of the BLOCKNUM*THREADNUM threads spins through
 * a 10000 x 10000 loop nest and stores a small integer in output[tid].
 * Expects a 1-D grid of 1-D blocks; output must hold one int per thread.
 *
 * NOTE(review): tempValue is overwritten on every iteration, so only the
 * final iteration (i = j = 9999) contributes to the stored value; the
 * compiler is free to collapse the whole loop nest, which would make this
 * an unreliable timing workload — confirm against the generated code if
 * the loops are meant to burn time.
 *
 * NOTE(review): on a display-attached GPU, Windows enforces a watchdog
 * limit (~5 s) on a single kernel launch; launches exceeding it are
 * aborted, which would match the reported "all results are 0 beyond
 * 5 sec" symptom — verify which GPU the CUDA context is created on.
 */
__global__ void
testKernel(int* output)
{
    /* Flat global thread index over the 1-D launch. */
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    int tempValue = 0;
    for (int i = 0; i < 10000; ++i)
    {
        for (int j = 0; j < 10000; ++j)
        {
            /* Depends only on (i + j); earlier iterations are dead stores. */
            tempValue = max( (i + j) % 4, (i + j) % 3 ) + 2;
        }
    }

    output[tid] = tempValue;
    /* (Removed the original trailing "tempValue = 0;" — a dead store
     * after the only use of the variable.) */
}
There are no compile-time or runtime errors issued. However, the computation results are not always correct. By tuning the upper limit of i or j in the kernel, we can get different run times. With the current values, it is about 6 seconds. When the upper limit of j is changed to, for example, 5000, the run time will be within 3 seconds. My problem is that when the kernel runtime goes beyond 5 seconds, the results are all 0; when it stays within 5 seconds, the results are correct.
Any suggestions are appreciated.