I calculated PI with integral. Code is the same, but number of threads per block was different. I found the it was really strange that PI is 3.13903 when I set 256 threads

per block.

Can any one explain why?

Besides, if I changed dimension of blocks1 to (8,2,1), result will be totally wrong. WHY?

the cpu result is:3.14159

time elapsed:16.177

CUDA initialized.

Processing time: 0.732636 (ms)

gpu result is:3.14159

with 32 per thread

Acceleration ratio:22.0805Processing time: 0.486263 (ms)

gpu result is:3.14159

with 64 per thread

Acceleration ratio:33.268Processing time: 0.397520 (ms)

gpu result is:3.14159

with 128 per thread

Acceleration ratio:40.6948Processing time: 0.367128 (ms)

gpu result is:3.13903

with 256 per thread

Acceleration ratio:44.0636Processing time: 0.361455 (ms)

gpu result is:3.14159

with 512 per thread

Acceleration ratio:44.7552Processing time: 0.480590 (ms)

gpu result is:3.14159

with 1024 per thread

Acceleration ratio:33.6607

here is my code:

```
/********************************************************************
* sample.cu
* This is a example of the CUDA program.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <iostream>
#include <windows.h>
using namespace std;
/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__
bool InitCUDA(void){return true;}
#else
bool InitCUDA(void)
{
int count = 0;
int i = 0;
cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}
for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA.\n");
return false;
}
cudaSetDevice(i);
printf("CUDA initialized.\n");
return true;
}
#endif
double cpuPI(int num){
double sum=0.0f;
double temp;
for(int i=0;i<num;i++){
temp=(i+0.5f)/num;
// printf ("%f\n", temp);
sum+=4/(1+temp*temp);
// printf ("%f\n", sum);
}
return sum/num;
}
__global__ void reducePI1 (float *d_sum, int num) {
int id=blockIdx.x*blockDim.x+threadIdx.x;//çº¿ç¨‹ç´¢å¼•
int gid=id;
float temp;
extern float __shared__ s_pi[];//
s_pi[threadIdx.x]=0.0f;
while(gid<num){
temp=(gid+0.5f)/num;//
s_pi[threadIdx.x]+=4.0f/(1+temp*temp);
gid+=blockDim.x*gridDim.x;
}
for(int i=(blockDim.x>>1);i>0;i>>=1){
if(threadIdx.x<i){
s_pi[threadIdx.x]+=s_pi[threadIdx.x+i];
}
__syncthreads(); //
}
if(threadIdx.x==0)
d_sum[blockIdx.x]=s_pi[0];
}
__global__ void reducePI2(float *d_sum,int num,float *d_pi){
int id=threadIdx.x;
extern float __shared__ s_sum[];
s_sum[id]=d_sum[id];
__syncthreads();
for(int i=(blockDim.x>>1);i>0;i>>=1){
if(id<i)
s_sum[id]+=s_sum[id+i];
__syncthreads();
}
// printf("%d,%f\n",id,s_sum[id]);
if(id==0){
*d_pi=s_sum[0]/num;
// printf("%d,%f\n",id,*pi);
}
}
int main(int argc, char* argv[])
{
DWORD t_S,t_E;
float t;
int n=100000;
t_S=GetTickCount();
cout<<"the cpu result is:";
cout<<cpuPI(1000000000)<<endl;;
t_E=GetTickCount();
t=t_E-t_S;
cout<<"time elapsed:"<<(float)t/1000<<endl;
if(!InitCUDA()) {
return 0;
}
float *d_sum;
float *d_pi;
float h_pi=1;
cudaMalloc(&d_sum,sizeof(float)*4);
cudaMalloc(&d_pi,sizeof(float));
int tn=16;
dim3 threads1(tn,1,1);
dim3 blocks1(2,2,1);// here to change dimension of blocks1
dim3 blocks2(1,1,1);
dim3 threads2(4,1,1);
//test
for(tn=32;tn<=1024;tn*=2)
{
threads1.x=tn;
unsigned int timer = 0;
cutilCheckError( cutCreateTimer( &timer));
cutilCheckError( cutStartTimer( timer));
reducePI1<<<blocks1,threads1,sizeof(float)*tn>>>(d_sum,n);
cutilCheckMsg("PI1 failed\n");
reducePI2<<<blocks2,threads2,sizeof(float)*4>>>(d_sum,n,d_pi);
cutilCheckMsg("PI2 failed\n");
cudaMemcpy(&h_pi,d_pi,sizeof(float),cudaMemcpyDeviceToHost );
cutilCheckMsg("Memcpy failed\n");
cudaThreadSynchronize();
cutilCheckError( cutStopTimer( timer));
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
cout<<endl;
cout<<"gpu result is:"<<h_pi<<endl;
cout<<"with "<<tn<<" per thread"<<" "<<endl;
cout<<"Acceleration ratio:"<<(t/1000) / cutGetTimerValue( timer);
cutilCheckError( cutDeleteTimer( timer));
}
cudaFree(d_sum);
cudaFree(d_pi);
return 0;
}
```