why result varied based on different number of threads per block?

I calculated PI with integral. Code is the same, but number of threads per block was different. I found the it was really strange that PI is 3.13903 when I set 256 threads

per block.

Can any one explain why?

Besides, if I changed dimension of blocks1 to (8,2,1), result will be totally wrong. WHY?

the cpu result is:3.14159

time elapsed:16.177

CUDA initialized.

Processing time: 0.732636 (ms)

gpu result is:3.14159

with 32 per thread

Acceleration ratio:22.0805Processing time: 0.486263 (ms)

gpu result is:3.14159

with 64 per thread

Acceleration ratio:33.268Processing time: 0.397520 (ms)

gpu result is:3.14159

with 128 per thread

Acceleration ratio:40.6948Processing time: 0.367128 (ms)

gpu result is:3.13903

with 256 per thread

Acceleration ratio:44.0636Processing time: 0.361455 (ms)

gpu result is:3.14159

with 512 per thread

Acceleration ratio:44.7552Processing time: 0.480590 (ms)

gpu result is:3.14159

with 1024 per thread

Acceleration ratio:33.6607

here is my code:

/********************************************************************

*  sample.cu

*  This is a example of the CUDA program.

*********************************************************************/

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#include <iostream>

#include <windows.h>

using namespace std;

/************************************************************************/

/* Init CUDA                                                            */

/************************************************************************/

#if __DEVICE_EMULATION__

bool InitCUDA(void){return true;}

#else

bool InitCUDA(void)

{

	int count = 0;

	int i = 0;

	cudaGetDeviceCount(&count);

	if(count == 0) {

		fprintf(stderr, "There is no device.\n");

		return false;

	}

	for(i = 0; i < count; i++) {

		cudaDeviceProp prop;

		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {

			if(prop.major >= 1) {

				break;

			}

		}

	}

	if(i == count) {

		fprintf(stderr, "There is no device supporting CUDA.\n");

		return false;

	}

	cudaSetDevice(i);

	printf("CUDA initialized.\n");

	return true;

}

#endif

double cpuPI(int num){ 

	double sum=0.0f; 

	double temp; 

	for(int i=0;i<num;i++){ 

		temp=(i+0.5f)/num; 

		//  printf ("%f\n", temp); 

		sum+=4/(1+temp*temp); 

		//  printf ("%f\n", sum); 

	} 

	return sum/num; 

} 

__global__ void reducePI1 (float *d_sum, int num) { 

	int id=blockIdx.x*blockDim.x+threadIdx.x;//线程索引 

	int gid=id; 

	float temp; 

	extern float __shared__ s_pi[];//

	s_pi[threadIdx.x]=0.0f; 

	while(gid<num){ 

		temp=(gid+0.5f)/num;// 

		s_pi[threadIdx.x]+=4.0f/(1+temp*temp); 

		gid+=blockDim.x*gridDim.x; 

	} 

	for(int i=(blockDim.x>>1);i>0;i>>=1){ 

		if(threadIdx.x<i){ 

			s_pi[threadIdx.x]+=s_pi[threadIdx.x+i]; 

		} 

		__syncthreads(); //

	} 

	if(threadIdx.x==0) 

		d_sum[blockIdx.x]=s_pi[0]; 

} 

__global__ void reducePI2(float *d_sum,int num,float *d_pi){ 

	int id=threadIdx.x; 

	extern float __shared__ s_sum[]; 

	s_sum[id]=d_sum[id]; 

	__syncthreads(); 

	for(int i=(blockDim.x>>1);i>0;i>>=1){ 

		if(id<i) 

			s_sum[id]+=s_sum[id+i]; 

		__syncthreads(); 

	} 

	//  printf("%d,%f\n",id,s_sum[id]); 

	if(id==0){ 

		*d_pi=s_sum[0]/num; 

		//  printf("%d,%f\n",id,*pi); 

	} 

} 

int main(int argc, char* argv[])

{

	DWORD t_S,t_E;

	float t;

	int n=100000;

	t_S=GetTickCount();

	cout<<"the cpu result is:";

	cout<<cpuPI(1000000000)<<endl;;

	t_E=GetTickCount();

	t=t_E-t_S;

	cout<<"time elapsed:"<<(float)t/1000<<endl;

	if(!InitCUDA()) {

		return 0;

	}

	float *d_sum;

	float *d_pi;

	float h_pi=1;

	cudaMalloc(&d_sum,sizeof(float)*4);

	cudaMalloc(&d_pi,sizeof(float));

	int tn=16;

	dim3 threads1(tn,1,1);

	dim3 blocks1(2,2,1);// here to change dimension of blocks1

	dim3 blocks2(1,1,1);

	dim3 threads2(4,1,1);

	//test

	for(tn=32;tn<=1024;tn*=2)

	{

		threads1.x=tn;

		unsigned int timer = 0;

		cutilCheckError( cutCreateTimer( &timer));

		cutilCheckError( cutStartTimer( timer));

		reducePI1<<<blocks1,threads1,sizeof(float)*tn>>>(d_sum,n);

		cutilCheckMsg("PI1 failed\n");

		reducePI2<<<blocks2,threads2,sizeof(float)*4>>>(d_sum,n,d_pi);

		cutilCheckMsg("PI2 failed\n");

		cudaMemcpy(&h_pi,d_pi,sizeof(float),cudaMemcpyDeviceToHost );

		cutilCheckMsg("Memcpy failed\n");

		

		cudaThreadSynchronize();

		cutilCheckError( cutStopTimer( timer));

		printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));

		cout<<endl;

		cout<<"gpu result is:"<<h_pi<<endl;

		cout<<"with "<<tn<<" per thread"<<"   "<<endl;

		cout<<"Acceleration ratio:"<<(t/1000) / cutGetTimerValue( timer);

		cutilCheckError( cutDeleteTimer( timer));

	}

	cudaFree(d_sum);

	cudaFree(d_pi);

	return 0;

}

HI

I don’t know if it’s your problem but

I know you can not use share memory variables

as extern, because they have a kernell scope

(see programming manual)

I think too that you could have some sort

of “race condition” in the varialbe s_pi

in this code:

for(int i=(blockDim.x>>1);i>0;i>>=1){ 

                if(threadIdx.x<i){ 

                        s_pi[threadIdx.x]+=s_pi[threadIdx.x+i]; 

                }

and in the variale s_sum in this code:

for(int i=(blockDim.x>>1);i>0;i>>=1){ 

                if(id<i) 

                        s_sum[id]+=s_sum[id+i]; 

                __syncthreads(); 

         }

when id is a thread id belonging to a warp

and id+1 is thread id belonging to the following warp.

In this case the pseudo casual order of warp execution

makes that sums inconsistent.

I think you should fix these points.

Rocco

You do not know that, because it is not correct. See Section B.2.3 of the very same CUDA programming guide:

sorry for my forgetfulness, shared memory has

a shorter scope of a thread block but I know what I said:

in this code an array of extern shared memory have no sense

and a simple shared definition is correct and sufficient

its content is not consistent every time

a new thread block (and a new kernel too) is executed

extern shared memory has really sense inside

a device subfunction called by another global function

in the same thread block to share/pass the same data.

so, why does he use “extern”?

Rocco

Read the part of the programming guide I quoted above. Extern declarations of the type you said are illegal allow shared memory allocation size to be determined at run time and not compile time.

Sorry but I’m not sure you understood what I meant
…for me extern shared memory have no sense here
with this code it not possible to understend why he use extern
and so why he have/want to use a dinamic runtime allocation
of shared memory

But the problem here is not the shared memory but a race condition,
as I wrote, so stop talking about shared memory here

I wanted only to warn about the use of extern in shared memory
to avoid inconsistence in data with a possible successive use of
extern shared memory…nothing more

but if you like to write a lot in these forums
here there is my problem :)

http://forums.nvidia.com/index.php?showtopic=193417

any suggestion or confirmation
will be very appreciate

thank you however in advance

Because the kernels are called with different block sizes and the shared memory buffers must be sized accordingly.

Neither of those two pieces of code you mentioned contain a memory race for the execution parameters used in the code. The real reason is the lack of a synchronization barrier in one place in the first kernel call, between the accumulation and shared memory reduction phases of the kernel. If the first kernel is rewritten like this:

__global__ void reducePI1 (float *d_sum, int num) { 

        int id=blockIdx.x*blockDim.x+threadIdx.x; 

        int gid=id; 

        float temp; 

        extern float __shared__ s_pi[];//

        s_pi[threadIdx.x]=0.0f; 

// Accumulate

        while(gid<num){ 

                temp=(gid+0.5f)/num;// 

                s_pi[threadIdx.x]+=4.0f/(1+temp*temp); 

                gid+=blockDim.x*gridDim.x; 

        }

// Ensure all accumulation is finished

	__syncthreads();

// parallel reduction

        for(int i=(blockDim.x>>1);i>0;i>>=1){ 

                if(threadIdx.x<i){ 

                        s_pi[threadIdx.x]+=s_pi[threadIdx.x+i]; 

                } 

                __syncthreads(); //

        } 

        if(threadIdx.x==0) 

                d_sum[blockIdx.x]=s_pi[0]; 

}

it will work correctly (at least up to the useful accuracy limit of single precision floating point). The reason why it can fail now is because any given warp may begin the parallel reduction while another has yet to finish the accumulation phase of the code.

Which is also total nonsense. In the context of shared memory declarations, extern doesn’t mean what you keep incorrectly asserting it means. It only controls whether shared memory can be allocated at runtime or not. No other behaviour changes occur whether the shared memory is declared extern or not.

I know this, I’m trying to explain that this dynamic use of threads

is not necessary and then …use of dynamic allocation

of shared memory has no sense

With more threads in a block (>128) the algoritm is not faster

because the NVIDIA warp scheduler is really fantastic

and it does not introduce latency stoping and starting warps and blocks

executions on a SM.

Generally using more threads per blocks there are less resources per thread

depends on your algorithm

maybe you dont know but this condition you are describing is called “race condition” :)

http://en.wikipedia.org/wiki/Race_condition

and your code here is missing at least of one __syncthread() :)

…it is really correct but…

…behaviour changes using extern shared memory erroneously

such as … sharing data between blocks

only for this reason I wrote my warning although this is not

the correct situation.

As you know, in C/C++, extern declaration is used exactly for sharing

data e.g. between functions and threads in multithread developing environment

I’m asking why our friend facat has not replied yet… :)

…did you read my post about a texture problem?? :)

I see. If I look at this simplified version of the original posters code:

#include <cmath>

#include <iostream>

using namespace std;

double cpuPI(long num)

{ 

    double sum=0.0f; 

#pragma omp parallel

    {

#pragma omp for reduction(+:sum)

        for(long i=0;i<num;i++){ 

            double temp = (double(i) + 0.5)/double(num); 

            sum+= 4./(1.+temp*temp); 

        } 

    }

    return sum/num; 

} 

extern float __shared__ buffer[];

__global__ void reducePI1 (float *d_sum, long num)

{ 

    volatile float * s_pi = &buffer[threadIdx.x];

    long gid=blockIdx.x*blockDim.x+threadIdx.x; 

    long stride = blockDim.x * gridDim.x;

    *s_pi = 0.f;

    while(gid<num){ 

        float temp=(gid+0.5f)/float(num);

        *s_pi += 4.0f / (1.f + temp*temp); 

        gid += stride; 

    } 

    __syncthreads();

    for(int i=(blockDim.x>>1);i>0;i>>=1){ 

        if(threadIdx.x<i) *s_pi += *(s_pi + i); 

        __syncthreads();

    } 

    if(threadIdx.x==0) 

        d_sum[blockIdx.x + gridDim.x * blockIdx.y] = *s_pi; 

} 

__global__ void reducePI2(float *d_sum,long num,float *d_pi){ 

    volatile float *s_sum = &buffer[threadIdx.x]; 

    int id=threadIdx.x; 

    *s_sum =d_sum[id]; 

    __syncthreads(); 

    for(int i=(blockDim.x>>1);i>0;i>>=1){ 

        if(id<i) *s_sum += *(s_sum+i); 

        __syncthreads(); 

    } 

    if(id==0) 

        *d_pi = *s_sum/float(num); 

} 

int main(int argc, char* argv[])

{

    const long n=8000000l;

    double h_pi_ref = cpuPI(n);

    cout.precision(10);

    cout<<"For n="<<n<<endl;

    cout<< "CPU result is "<<h_pi_ref<<endl;

float *d_sum, *d_pi;

    dim3 blocks1(4,1,1), blocks2(1,1,1);

    cudaMalloc(&d_sum,sizeof(float)*size_t(blocks1.x));

    cudaMalloc(&d_pi,sizeof(float));

cout << "blocks = " << blocks1.x << endl;

    for(int tn=32;tn<=1024;tn*=2) {

        float h_pi = 0.f;

        dim3 threads1 = dim3(tn,1,1), threads2 = blocks1;

reducePI1<<<blocks1,threads1,sizeof(float)*tn>>>(d_sum,n);

        reducePI2<<<blocks2,threads2,sizeof(float)*size_t(threads2.x)>>>(d_sum,n,d_pi);

        cudaMemcpy(&h_pi,d_pi,sizeof(float),cudaMemcpyDeviceToHost );

double relerr = fabs(h_pi_ref-h_pi) / h_pi_ref;

        cout<<"threads = "<<tn<<", gpu result = "<<h_pi;

        cout<<", relative error = "<<relerr<<endl;

    }

return (int)cudaThreadExit();

}

which does this when compiled and run:

avidday@cuda:~$ nvcc -arch=sm_20 -Xptxas="-v" -Xcompiler="-O3 -fopenmp" samplePI.cu -o samplePI.exe 

ptxas info    : Compiling entry function '_Z9reducePI2PflS_' for 'sm_20'

ptxas info    : Used 11 registers, 56 bytes cmem[0], 4 bytes cmem[16]

ptxas info    : Compiling entry function '_Z9reducePI1Pfl' for 'sm_20'

ptxas info    : Used 22 registers, 48 bytes cmem[0], 4 bytes cmem[16]

avidday@cuda:~$ OMP_NUM_THREADS=4 ./samplePI.exe 

For n=8000000

CPU result is 3.141592654

blocks = 4

threads = 32, gpu result = 3.141587973, relative error = 1.489992282e-06

threads = 64, gpu result = 3.141591787, relative error = 2.757364295e-07

threads = 128, gpu result = 3.141592026, relative error = 1.998454387e-07

threads = 256, gpu result = 3.141592503, relative error = 4.806345714e-08

threads = 512, gpu result = 3.141592503, relative error = 4.806345714e-08

threads = 1024, gpu result = 3.141592503, relative error = 4.806345714e-08

and which the visual profiler tells me takes this long on a GTX470

Kernel          GPU time	BlockDim.x

------------------------------------------

reducePI1	66673.2		32

reducePI1	33338.3		64

reducePI1	16671.9		128

reducePI1	8616.7		256

reducePI1	5386.37		512

reducePI1	4960.54		1024

I am left somewhat confused. I would greatly appreciate it if you could point out the errors in the code, because clearly I have gaps in my understanding of how CUDA actually works. If I have understood your analysis of the code posted in this thread correctly, the following all should apply:

    The parallel shared memory reductions of each kernel contain memory races

    The first kernel requires an additional syncthreads() call to function correctly

    The use of externally declared shared memory is either illegal or somehow dangerous and unnecessary in this example

    Increasing the thread count per block beyond 128 should make no difference to kernel performance in this case.

but for some reason the output appears to contradict all of those points. What am I doing wrong?