Problem with double datatype and 285GTX

I have a new GTX 285 and would now like to test the double datatype.

I have created a simple test kernel that multiplies a one-dimensional vector by a constant factor. With the double datatype the result is zero; if I use float instead, the result is correct.

I use CUDA 2.1 Software.

Here is my code:

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cufft.h>

#include <cutil_inline.h>

// Forward declaration of the scaling kernel, which is defined after runTest.
// Parameters, in order: input array, output array, scale factor, element count.
// The qualifier must be spelled __global__; a bare `global` is not a CUDA keyword
// and does not compile.
static __global__ void absolute_value(double*, double*, double, unsigned int);

////////////////////////////////////////////////////////////////////////////////

// Forward declaration: runTest is the host-side test driver called from main; it is defined after main.

void runTest(int argc, char** argv);

// Number of double elements in the test signal.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x / SIGNAL_SIZE` or `x % SIGNAL_SIZE`).
#define SIGNAL_SIZE (512*256)

// Constant factor every element of the signal is multiplied by on the device.
#define coeff 0.0135

////////////////////////////////////////////////////////////////////////////////
// Program entry point: runs the test, then hands control to the cutil exit
// helper with the original command line.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // All device work (allocation, kernel launch, result printout) happens here.
    runTest(argc, argv);

    // SDK exit helper; NOTE(review): presumably prompts before closing the
    // console unless told otherwise on the command line -- confirm in cutil.
    cutilExit(argc, argv);

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
//! Host-side driver for the double-precision scaling test.
//!
//! Fills a host array of SIGNAL_SIZE doubles with 3.0, copies it to the device,
//! launches absolute_value so that every element is multiplied by coeff, copies
//! the result back and prints the first values of the input and of the output.
//!
//! @param argc  command-line argument count, forwarded to the cutil helpers
//! @param argv  command-line arguments; when a "device" flag is present the
//!              device is chosen by cutilDeviceInit, otherwise the device
//!              reported by cutGetMaxGflopsDeviceId() is used
//!
//! NOTE: the kernel works on double. Device code must be built for compute
//! capability 1.3 or higher (nvcc -arch sm_13); for older targets nvcc demotes
//! double to float in device code while the host still passes 8-byte doubles,
//! which produces wrong results.
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv)
{
    // Device selection (plain ASCII quotes are required around the flag name).
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        cutilDeviceInit(argc, argv);
    }
    else
    {
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));
    }

    const unsigned int length   = SIGNAL_SIZE;
    const size_t       mem_size = sizeof(double) * length;   // size_t: byte counts must not be int
    const unsigned int shown    = (length < 10) ? length : 10;

    // Allocate host memory for the signal and for the result
    double* h_signal_x = (double*)malloc(mem_size);
    double* h_results  = (double*)malloc(mem_size);
    if (h_signal_x == NULL || h_results == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)mem_size);
        free(h_signal_x);   // free(NULL) is a no-op
        free(h_results);
        return;
    }

    // Initialize the signal
    for (unsigned int i = 0; i < length; ++i)
    {
        h_signal_x[i] = 3.0;
    }

    for (unsigned int i = 0; i < shown; ++i)
    {
        printf("Wert: %f \n", h_signal_x[i]);
    }

    // Allocate device memory for the input signal and copy the host data over
    double* d_in = NULL;
    cutilSafeCall(cudaMalloc((void**)&d_in, mem_size));
    cutilSafeCall(cudaMemcpy(d_in, h_signal_x, mem_size, cudaMemcpyHostToDevice));

    // Allocate device memory for the result
    double* d_out = NULL;
    cutilSafeCall(cudaMalloc((void**)&d_out, mem_size));

    // One thread per element; round the grid up so a length that is not a
    // multiple of the block size is still fully covered (the kernel bounds-checks).
    const unsigned int threads = 512;
    const unsigned int blocks  = (length + threads - 1) / threads;
    absolute_value<<<blocks, threads>>>(d_in, d_out, coeff, length);

    // Check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed [ absolute_value ]");

    // Copy device memory to host (blocking copy, so the kernel has finished)
    cutilSafeCall(cudaMemcpy(h_results, d_out, mem_size, cudaMemcpyDeviceToHost));

    // Show the result
    for (unsigned int i = 0; i < shown; ++i)
    {
        printf(" Ergebnis: %2.20f \n", h_results[i]);
    }

    // Clean up: release every host and device buffer allocated above
    free(h_signal_x);
    free(h_results);
    cutilSafeCall(cudaFree(d_in));
    cutilSafeCall(cudaFree(d_out));

    cudaThreadExit();
}

// Element-wise scaling kernel: d_out[i] = d_in[i] * y for every i < length.
// (Despite its name the kernel takes no absolute value; it only multiplies.)
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index is >= length do nothing, so the grid may overshoot length.
// Uses no shared memory.
//
// d_in   - device pointer to the input values (at least length elements)
// d_out  - device pointer receiving the results (at least length elements)
// y      - factor each element is multiplied by
// length - number of elements to process
//
// Requires compute capability 1.3+ for double precision: compile with
// nvcc -arch sm_13, otherwise double is demoted to float in device code.
__global__ void absolute_value(double *d_in, double *d_out, double y, unsigned int length)
{
    // Unsigned index so the comparison with the unsigned length is not a
    // mixed signed/unsigned comparison.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < length)
    {
        d_out[tid] = d_in[tid] * y;
    }
}

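Compile with -arch sm_13. Without that flag nvcc targets compute capability 1.0, which has no double-precision support, so double is demoted to float in device code while the host still passes 8-byte doubles to the kernel.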

Tomorrow I will test it.

These are my compiler parameters. How must I modify these options?

"$(CUDA_BIN_PATH)\nvcc.exe" -ccbin "$(VCInstallDir)bin" -c -D_DEBUG -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/Od,/Zi,/RTC1,/MTd -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -o $(ConfigurationName)\simpleCUFFT.obj simpleCUFFT.cu

1. Is my modification OK?

"$(CUDA_BIN_PATH)\nvcc.exe" -arch sm_13 -ccbin "$(VCInstallDir)bin" -c -D_DEBUG -DWIN32 -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/Od,/Zi,/RTC1,/MTd -I"$(CUDA_INC_PATH)" -I./ -I../../common/inc -o $(ConfigurationName)\simpleCUFFT.obj simpleCUFFT.cu