I have a new GTX 285 and would like to test the double datatype.
I have created a simple test kernel that multiplies a one-dimensional vector by a constant factor. With double, the result is zero; with float, the result is correct.
I am using the CUDA 2.1 software.
Here is my code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil_inline.h>
// Forward declaration of the scaling kernel (defined at the end of the file).
// Arguments: input vector, output vector, scale factor, element count.
static __global__ void absolute_value(double*, double*, double, unsigned int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
// Number of elements in the test vector. Parenthesized so the macro expands
// safely inside larger expressions (e.g. x / SIGNAL_SIZE).
#define SIGNAL_SIZE (512*256)
// Constant factor every vector element is multiplied by.
#define coeff 0.0135
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // Execute the double-precision test, then pass the command line on to
    // cutilExit before leaving the program.
    runTest(argc, argv);
    cutilExit(argc, argv);
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
// Scales a host vector of SIGNAL_SIZE doubles by `coeff` on the GPU and prints
// the first ten inputs and the first ten results.
//
// argc/argv: the program's command line; a "device" flag selects the GPU via
//            cutilDeviceInit, otherwise the device reported by
//            cutGetMaxGflopsDeviceId() is used.
//
// NOTE: double precision needs a device of compute capability 1.3 or higher
// AND a build that targets it (e.g. nvcc -arch=sm_13). Without the flag the
// device code is compiled with doubles demoted to float while the host still
// passes 8-byte doubles, which yields wrong (typically zero) results.
void runTest(int argc, char** argv)
{
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));

    // Warn early when the selected device cannot execute double precision.
    int device = 0;
    cutilSafeCall(cudaGetDevice(&device));
    cudaDeviceProp props;
    cutilSafeCall(cudaGetDeviceProperties(&props, device));
    if (props.major < 1 || (props.major == 1 && props.minor < 3))
    {
        fprintf(stderr,
                "Warning: device has compute capability %d.%d; double precision "
                "requires 1.3 or higher.\n",
                props.major, props.minor);
    }

    // Capture the element count once so later arithmetic does not depend on
    // how the SIGNAL_SIZE macro expands.
    const unsigned int n = SIGNAL_SIZE;
    const size_t mem_size = sizeof(double) * n;

    // Allocate host memory for the input signal and the results
    double* h_signal_x = (double*)malloc(mem_size);
    double* h_results = (double*)malloc(mem_size);
    if (h_signal_x == NULL || h_results == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_signal_x);
        free(h_results);
        exit(EXIT_FAILURE);
    }

    // Initialize the input signal
    for (unsigned int i = 0; i < n; ++i)
    {
        h_signal_x[i] = 3.0;
    }
    for (unsigned int i = 0; i < 10 && i < n; ++i)
    {
        printf("Wert: %f \n", h_signal_x[i]);
    }

    // Allocate device memory for the input signal and copy it over
    double* d_in = NULL;
    cutilSafeCall(cudaMalloc((void**)&d_in, mem_size));
    cutilSafeCall(cudaMemcpy(d_in, h_signal_x, mem_size, cudaMemcpyHostToDevice));

    // Allocate device memory for the result
    double* d_out = NULL;
    cutilSafeCall(cudaMalloc((void**)&d_out, mem_size));

    // Multiply every element by coeff. The block count is rounded up so the
    // whole vector is covered even if n is not a multiple of the block size;
    // the kernel's bounds check discards the surplus threads.
    const unsigned int threads = 512;
    const unsigned int blocks = (n + threads - 1) / threads;
    absolute_value<<<blocks, threads>>>(d_in, d_out, coeff, n);

    // Check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed [ absolute_value ]");

    // Copy the result back to the host (blocking copy)
    cutilSafeCall(cudaMemcpy(h_results, d_out, mem_size, cudaMemcpyDeviceToHost));

    // Show the result
    for (unsigned int i = 0; i < 10 && i < n; ++i)
    {
        printf(" Ergebnis: %2.20f \n", h_results[i]);
    }

    // Release every host and device buffer allocated above
    free(h_signal_x);
    free(h_results);
    cutilSafeCall(cudaFree(d_in));
    cutilSafeCall(cudaFree(d_out));
    cudaThreadExit();
}
// Scales a vector by a constant factor: d_out[i] = d_in[i] * y for every
// i in [0, length). Despite its name, the kernel does not compute an
// absolute value.
//
// d_in   : input vector (device pointer), at least `length` elements
// d_out  : output vector (device pointer), at least `length` elements
// y      : factor each element is multiplied by
// length : number of elements to process
//
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// index falls outside the vector do nothing. Double precision requires a
// device of compute capability 1.3+ and a matching build (nvcc -arch=sm_13).
__global__ void absolute_value(double *d_in, double *d_out, double y, unsigned int length)
{
    // Unsigned index so the comparison with `length` does not mix signedness.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < length)
    {
        d_out[tid] = d_in[tid] * y;
    }
}