My GPU is a GTX 280 (compute capability 1.3). NVIDIA says it should support double precision, but my
tests show that it does not. Can anyone give me a suggestion, or can someone at NVIDIA
give me an explanation?
My email: zhangyuaniecas@gmail.com
Program 1) shows that the GTX 280 does not support double.
Program 2) shows that the CUDA 2.0 cos/sin/tan functions do not support double.
//--------------------------double test------------------------------------------------
#include <stdio.h>
#include <cutil.h>
// Element type used for the kernel argument and the host result array.
// Swap the two defines below to run the same test with float instead of double.
#define VARTYPE double
//#define VARTYPE float
/*
 * Writes each thread's flattened in-block index into data[].
 *
 * data: output array; must hold at least blockDim.x * blockDim.y elements.
 *
 * The index is built from threadIdx only (blockIdx is not used), so the
 * kernel is meant to be launched with a single block.
 */
__global__ void test(VARTYPE *data) {
    // Row-major index inside the block: the row stride is the block width
    // (blockDim.x), not the grid width.
    int id = threadIdx.x + threadIdx.y * blockDim.x;
    data[id] = id;
}
#define NB 16  // number of threads launched == number of elements written and printed

/*
 * Host driver for the VARTYPE test: allocates NB elements on the device,
 * launches test() with one block of NB x 1 threads, copies the result back
 * and prints one "index value" line per element.
 *
 * NOTE(review): on compute capability 1.3 hardware, double precision has to
 * be enabled at compile time (nvcc -arch=sm_13); presumably without it nvcc
 * targets an older architecture and demotes double to float -- confirm
 * against the nvcc documentation for the toolkit in use.
 */
int main() {
    dim3 blockSize(NB, 1);
    VARTYPE *dbg = NULL;   // device buffer filled by the kernel
    VARTYPE res[NB];       // host copy of the device buffer

    // cudaMalloc takes the address of the pointer it fills in.
    CUDA_SAFE_CALL(cudaMalloc((void **)&dbg, sizeof(*dbg) * NB));

    test<<<1, blockSize>>>(dbg);
    CUT_CHECK_ERROR("1. kernel");

    // The device-to-host cudaMemcpy waits for the kernel to finish, so no
    // explicit synchronization call is needed before reading res[].
    CUDA_SAFE_CALL(cudaMemcpy(res, dbg, sizeof(*res) * NB, cudaMemcpyDeviceToHost));

    for (int i = 0; i != NB; i++) printf("%3d %f\n", i, res[i]);

    CUDA_SAFE_CALL(cudaFree(dbg));
    return 0;
}
//--------------------------------------------------------------------------
//---------------------------cos test------------------------------------------------
#include<stdio.h>
#include<cutil.h>
#include<unistd.h>
// Floating-point type used for the host/device buffers and by do_cos.
// Swap the two typedefs below to run the test in double precision.
typedef float FP_t;
//typedef double FP_t;
int line=10;          // elements per row; passed to do_cos as N
int print_count=20;   // number of "cpu:" / "gpu:" lines printed by main
int BATCH=10;         // number of rows; passed to do_cos as nbatch
int memsz=0;          // buffer size in bytes; computed in main
FP_t test_num=1.56;   // value every input element is initialized to
/*
 * Replaces every element of `in` with its cosine, in place.
 *
 * in     : nbatch consecutive rows of N elements each.
 * N      : number of elements per row.
 * nbatch : number of rows.
 *
 * Blocks step over rows in increments of gridDim.x and the threads of a
 * block step over the elements of a row in increments of blockDim.x, so
 * every element is processed for any 1D grid/block size.
 */
static __global__ void do_cos(FP_t* in,int N,int nbatch)
{
    for (int row = blockIdx.x; row < nbatch; row += gridDim.x)
    {
        FP_t* rowPtr = in + row * N;   // start of this block's current row
        for (int col = threadIdx.x; col < N; col += blockDim.x)
        {
            rowPtr[col] = cos(rowPtr[col]);
        }
    }
}
/*
 * Host driver for the cos test: fills a pinned host buffer of line*BATCH
 * elements with test_num, prints the host cos() of the input, runs do_cos
 * on the device copy, copies the result back and prints it for comparison.
 *
 * Prints at most print_count lines per side, and never more than the number
 * of elements in the buffer.
 */
int main(int argc, char **argv)
{
    CUT_DEVICE_INIT(argc, argv);

    FP_t* d_ptr_in = NULL;   // device buffer
    FP_t* h_ptr = NULL;      // pinned host buffer

    const int total = line * BATCH;   // total number of elements
    // Never index past the end of the buffer when printing.
    const int shown = (print_count < total) ? print_count : total;
    memsz = sizeof(FP_t) * total;

    CUDA_SAFE_CALL(cudaMalloc((void **)&d_ptr_in, memsz));
    CUDA_SAFE_CALL(cudaMallocHost((void **)&h_ptr, memsz));

    for (int i = 0; i < total; i++) h_ptr[i] = test_num;

    // Host reference values; every element holds test_num, so each line is identical.
    for (int i = 0; i < shown; i++) printf("cpu:%20.10f\n", cos(h_ptr[i]));

    CUDA_SAFE_CALL(cudaMemcpy(d_ptr_in, h_ptr, memsz, cudaMemcpyHostToDevice));
    do_cos<<<20,128>>>(d_ptr_in, line, BATCH);
    CUT_CHECK_ERROR("kernel failed");
    CUDA_SAFE_CALL(cudaMemcpy(h_ptr, d_ptr_in, memsz, cudaMemcpyDeviceToHost));

    for (int i = 0; i < shown; i++) printf("gpu:%20.10f\n", h_ptr[i]);

    // Release both buffers before the exit macro runs.
    CUDA_SAFE_CALL(cudaFreeHost(h_ptr));
    CUDA_SAFE_CALL(cudaFree(d_ptr_in));

    CUT_EXIT(argc, argv);
    return 0;   // NOTE(review): presumably unreachable if CUT_EXIT terminates the process -- confirm
}
//-----------------------------------------------------------------------------------------