Variable global

Hi people, I don’t know how to use a global variable. Example:

#include <stdio.h>

// Device-resident pointer that the kernel below is meant to dereference.
__device__ int *var;

// Kernel: every launched thread issues one device-side printf.
__global__ void kernel(){

        // NOTE(review): the closing quote sits after `*var`, so `,*var` is part of
        // the format string and no argument is supplied for %d.
        printf("VAR=%d\n,*var");

}

// Host entry point: tries to allocate and zero one int for `var`, then launches
// `kernel` with 1 block of 10 threads and waits for it to finish.
int main(){

        // NOTE(review): `var` is declared __device__, yet it is used here directly
        // from host code. Host access to a __device__ variable normally goes through
        // cudaMemcpyToSymbol / cudaGetSymbolAddress, so this call presumably does
        // not set the pointer that the kernel sees -- confirm.
        cudaMalloc( (void**)&var,sizeof(int) );

        // Fills sizeof(int) bytes with the byte value 0.
        cudaMemset(var,0,sizeof(int));

        kernel<<<1,10>>>();

        // Waits for the kernel. None of the CUDA calls in this function have their
        // return status checked, so a failure would go unnoticed.
        cudaThreadSynchronize();

        return 0;

}

Why doesn’t it print the value of “var”, which is 0, 10 times?

Is that right? What’s Your output?

Regards,

MK

Nothing!!! It never prints anything!

Consider this code instead of the printf call:

printf("VAR=%d\n",*var); // You had " in the wrong place

Helped?

Regards,

MK

This code:

#include <stdio.h>

// Device-resident pointer; the kernel below prints it.
__device__ int *var;

// Kernel: every launched thread issues one device-side printf.
__global__ void kernel(){

        // NOTE(review): the argument is the pointer `var` itself, not the int it
        // points to (`*var`), while the format specifier is %d.
        printf("VAR=%d\n",var);

}

// Host entry point: tries to allocate one int for `var` and fill it, then
// launches `kernel` with 1 block of 10 threads and waits for it to finish.
int main(){

        // NOTE(review): `var` is declared __device__, yet it is used here directly
        // from host code. Host access to a __device__ variable normally goes through
        // cudaMemcpyToSymbol / cudaGetSymbolAddress, so this call presumably does
        // not set the pointer that the kernel sees -- confirm.
        cudaMalloc( (void**)&var,sizeof(int) );

        // cudaMemset writes the given value into every byte, so an int filled this
        // way would hold 0x01010101 rather than 1.
        cudaMemset(var,1,sizeof(int));

        kernel<<<1,10>>>();

        // Waits for the kernel. None of the CUDA calls in this function have their
        // return status checked, so a failure would go unnoticed.
        cudaThreadSynchronize();

        return 0;

}

prints “VAR=0” 10 times, not “VAR=1”. Why?

Resolved!

Now I want to know another thing: what should I do to avoid creating race conditions between threads? Code:

#include <stdio.h>

// Global counter incremented once by every thread of the launch.
__device__ int var=0;

// Kernel: each thread atomically adds 1 to `var` and then prints `var`.
__global__ void kernel(){

        // Block-wide barrier; no shared memory is exchanged in this kernel.
        __syncthreads();

        // The value returned by atomicAdd (the counter before this thread's
        // increment) is discarded here.
        atomicAdd(&var,1);

        // Separate, later read of `var`: it shows whatever the counter holds at that
        // moment, including increments made by other threads in the meantime.
        printf("VAR=%d\n",var);

        __syncthreads();

}

// Host entry point: launches `kernel` with 1 block of 10 threads, waits for it
// to finish and reports any CUDA error.
// Returns 0 on success, 1 if the launch or the execution failed.
int main(){
        // `var` is a statically initialised __device__ int, so no cudaMalloc or
        // cudaMemset is needed before the launch.
        kernel<<<1,10>>>();

        // A launch returns no status of its own; a bad configuration shows up here.
        cudaError_t err = cudaGetLastError();

        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. It
        // blocks until the kernel is done, which also flushes device-side printf
        // output, and it returns any error raised while the kernel was running.
        if (err == cudaSuccess)
                err = cudaDeviceSynchronize();

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
                return 1;
        }

        return 0;
}

print:

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

VAR=10

instead I want that print

VAR=1

VAR=2

VAR=3

VAR=4

VAR=5

VAR=6

VAR=7

VAR=8

VAR=9

VAR=10

How must I do?

All threads do the same thing: each one adds 1 to ‘var’ and then prints it. Because the additions are atomic and, in this run, all of them have completed before any thread prints, the value of ‘var’ when printing is 10. To get the output You want, both the adding and the printing would have to be done in one atomic operation, which, as far as I know, cannot be done simply (but You could consider implementing a kind of locking). Such output could also be generated by printing the thread index plus one, couldn’t it?

Regards,
MK

P.S.
Are those __syncthreads() needed here?

My problem is how to avoid race conditions in this part of the kernel:

// Keep the candidate only when its mass lies strictly between M0-del and M0+del.
if(massa>M0-del && massa<M0+del){

                        // NOTE(review): each store below reads the shared counter `Ind`, which is
                        // only incremented after all six stores. Threads that get here together
                        // can therefore see the same `Ind` and write to the same element.
                        VettRis[Ind].x=Candidato.x;

                        VettRis[Ind].y=Candidato.y;

                        VettRis[Ind].z=Candidato.z;

                        VettRis[Ind].Ene=Candidato.Ene;

                        VettRis[Ind].qv1=i;

                        VettRis[Ind].qv2=j;

                        // The value returned by atomicAdd (the counter before the increment) is
                        // discarded here; it is the one value that is unique to this thread.
                        atomicAdd(&Ind,1);

                        // Re-reads `Ind`, so this prints whatever the counter holds at that moment.
                        printf("IND=%d\n",Ind);

                }

where Ind is a global variable defined in this way:

__device__ int Ind=0;

How to do?

I suppose that ‘Ind’ is to be incremented every time the above condition is fulfilled. Depending on the number of threads (N) for which the ‘if’ is true, ‘Ind’ will be incremented by 1, N times. Say ‘Ind’ has the value 3 before entering the ‘if’. For all threads that enter it, ‘Ind’ has the same value. Assume that N = 7 threads enter the ‘if’. Thus, after the calls to ‘atomicAdd’, the value of ‘Ind’ will be:

Ind = Ind + N * 1 = 3 + 7 * 1 = 10. The printf will produce such output. After ‘if’ executes the value of ‘Ind’ will be 10.

I’m not quite sure what this code is for, but it doesn’t matter. I think that using a global variable in such a manner is not a good idea. Maybe consider using shared memory?

Hope I helped,

MK

My trouble is that I must copy into VettRis, at DIFFERENT POSITIONS INDICATED BY “Ind”, only the values that pass the if condition, but the threads enter at the same time and write to the same position (position 0). Can I use a semaphore?

Try this or that. And a one more

I also tried something similar to the first link:

// Mass-window test, with the store guarded by a hand-rolled spin lock on `flag`.
if(massa>M0-del && massa<M0+del){

// atomicCAS(&flag,0,1) returns the previous value of `flag`; it returns 0 for the
// thread that actually changed the flag from 0 to 1.
// NOTE(review): this loop keeps spinning until the call returns 1, i.e. until the
// flag was already set, which looks inverted for a lock acquire -- confirm.
// NOTE(review): a busy-wait lock contended by threads of the same warp can hang
// on hardware that runs a warp in lockstep; presumably this is the reported freeze.
while( (atomicCAS(&flag,0,1))!=1 );

                        printf("IND=%d\n",Ind);

                        // All six stores use the current value of the shared counter `Ind`.
                        Pi0[Ind].x=Candidato.x;

                        Pi0[Ind].y=Candidato.y;

                        Pi0[Ind].z=Candidato.z;

                        Pi0[Ind].Ene=Candidato.Ene;

                        Pi0[Ind].g1=i;

                        Pi0[Ind].g2=j;

                        // Plain (non-atomic) increment; it relies on the lock for exclusivity.
                        Ind++;

                        // Release: reset the flag to 0.
                        atomicExch(&flag,0);

                }

but it froze the system. Is there no way to do this?

To increment ‘Ind’ without data loss or dead locking, You could copy it to shared memory for each thread entering the ‘if’, synchronize them (after ‘if’), and then sum what You copied to shared memory. For those threads that did not enter the ‘if’ shared memory should be initialized to 0. Do the summing in a local variable first, then ‘atomicAdd’ it to global ‘Ind’. I provide code that does what I described, for better understanding what I mean.

// Launch configuration used by this example; the kernel sizes its shared array
// from THREADS_IN_BLOCK.
#define THREADS_IN_BLOCK 16

#define BLOCKS_IN_GRID	1

// Global counter that the kernel below updates.
__device__ int Ind = 0;

// Demo kernel: threads that pass the `if` each record one increment in shared
// memory, the per-thread entries are summed after a block barrier, and thread 0
// adds that sum to the global `Ind` with a single atomicAdd.
__global__ void kernel() 

{

	__shared__ int s_Ind[THREADS_IN_BLOCK];	// one slot per thread of the block (author's note: the more, smaller blocks, the better (?))

	int tid = threadIdx.x;	// valid only if the block is one-dimensional

	int l_Ind = 0;

	s_Ind[tid] = 0;

	if (tid > -1 && tid < 5)	// stand-in for the real condition (here 5 threads enter the 'if')

	{

		// NOTE(review): this seeds the slot with the current global value, so the
		// sum below also contains one copy of `Ind` per entering thread. That is
		// harmless only while `Ind` is still 0.
		s_Ind[tid] = Ind;

		// do Your work...

		s_Ind[tid] += 1;

	}

	// All slots must be written before any thread starts summing them.
	__syncthreads();

	// Every thread computes the full block sum on its own.
	for (int i = 0; i < THREADS_IN_BLOCK; ++i) l_Ind += s_Ind[i];

	if (!tid) atomicAdd(&Ind, l_Ind);      // only thread 0 updates the global counter (author's note: my bad - too much adding was applied here...)

	// NOTE(review): there is no barrier between thread 0's atomicAdd and this read
	// of `Ind`, so the value printed for `Ind` may be from before or after the add.
	printf("%d: s_Ind[%d] = %d, l_Ind = %d, Ind = %d\n", tid, tid, s_Ind[tid], l_Ind, Ind);

	// With Ind starting at 0 and a single 16-thread block: s_Ind[tid] is 1 for
	// tid < 5 and 0 otherwise, l_Ind = 5, and Ind = 5 once thread 0's add has landed.

}

// Host entry point: selects device 0, launches `kernel` with the configured
// grid/block size, waits for completion and reports any CUDA error.
// Always waits for a key press before exiting so the console output stays visible.
// Returns 0 on success, 1 if any CUDA call or the kernel failed.
int main()
{
	cudaError_t status = cudaSetDevice(0);

	if (status == cudaSuccess)
	{
		kernel<<<BLOCKS_IN_GRID, THREADS_IN_BLOCK>>>();

		// A launch returns no status of its own; a bad configuration shows up here.
		status = cudaGetLastError();
	}

	// cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. It
	// blocks until the kernel is done (flushing device-side printf output) and
	// returns any error raised while the kernel was running.
	if (status == cudaSuccess)
		status = cudaDeviceSynchronize();

	if (status != cudaSuccess)
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));

	getchar();

	return status == cudaSuccess ? 0 : 1;
}

The output of the code above is in the attachment.

Ind.png

Regards,

MK

[I don’t know why my previous post disappeared from the thread. Trying again…]

Try this code:

#include <stdio.h>

// Counter shared by every thread of the launch; starts at zero.
__device__ int var = 0;

// Each thread takes one ticket from `var` and prints the ticket it received.
__global__ void kernel()
{
        // atomicAdd hands back the value `var` held just before this thread's
        // increment, so no two threads print the same number.
        const int ticket = atomicAdd(&var, 1);

        printf("VAR=%d\n", ticket);
}

// Host entry point: launches `kernel` with 1 block of 10 threads, waits for it
// to finish and reports any CUDA error.
// Returns 0 on success, 1 if the launch or the execution failed.
int main(){
        kernel<<<1,10>>>();

        // A launch returns no status of its own; a bad configuration shows up here.
        cudaError_t err = cudaGetLastError();

        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. It
        // blocks until the kernel is done, which also flushes device-side printf
        // output, and it returns any error raised while the kernel was running.
        if (err == cudaSuccess)
                err = cudaDeviceSynchronize();

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
                return 1;
        }

        return 0;
}

It should print numbers from 0 to 9, although no particular order is guaranteed.

cmaster.matso: I don’t see how your code is going to work reliably with more than one block.

tera: Then could You tell me what makes my code unreliable with more than one block (given the code before my correction in the previous post, and apart from the fact that I could have misunderstood something)?

Bless You,
MK

Nothing prevents two different blocks from reading and using the same value of `Ind` before the atomic adds are performed.

OK - I reviewed my code and tera is right. Sorry for misleading You. I still have much to learn myself.

Regards,
MK

P.S.

tera: Could such usage of ‘atomicAdd’ cause a longer execution time when a high number of threads increment ‘Ind’?

Yes, you can reduce contention for the global variable by having just one thread per block accessing it. Something like this should work for one-dimensional blocks:

#include <stdio.h>

// Counter shared by all blocks of the launch; starts at zero.
__device__ int var = 0;

// One thread per block reserves blockDim.x consecutive values from `var` with a
// single atomicAdd; every thread then prints the reserved base plus its own
// threadIdx.x. Written for one-dimensional blocks.
__global__ void kernel()
{
        // First value of the range reserved for this block, visible to all its threads.
        __shared__ int blockBase;

        const bool isLeader = (threadIdx.x == 0);
        if (isLeader) {
                blockBase = atomicAdd(&var, blockDim.x);
        }

        // The leader's write must be complete before the other threads read it.
        __syncthreads();

        printf("VAR=%d\n", blockBase + threadIdx.x);
}

// Host entry point: launches `kernel` with 3 blocks of 5 threads, waits for it
// to finish and reports any CUDA error.
// Returns 0 on success, 1 if the launch or the execution failed.
int main(){
        kernel<<<3,5>>>();

        // A launch returns no status of its own; a bad configuration shows up here.
        cudaError_t err = cudaGetLastError();

        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. It
        // blocks until the kernel is done, which also flushes device-side printf
        // output, and it returns any error raised while the kernel was running.
        if (err == cudaSuccess)
                err = cudaDeviceSynchronize();

        if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
                return 1;
        }

        return 0;
}