gpu_sync Problem with interblock synchronisation (gpu_sync)

Eric_G · March 1, 2010, 10:02am

Has anyone tried the gpu_sync functions described in Inter-Block GPU Communication via Fast Barrier Synchronization, Shucai Xiao and Wu-chun Feng, Department of Computer Science, Virginia Tech? The code described in this article seems perfectly alright to me, but in the code below, __gpu_sync() does not stop the threads as it should do. I must be missing something, easily, but help!!.. :"> Thanks!

[codebox]#include <cutil.h>

const int N = 8; // longueur de la chaine

// the mutex variable

device int g_mutex = 0;

device void __gpu_sync(int goalVal, int tid_in_block, int *ad){

int u;

if (tid_in_block == 0) {

    atomicAdd(&g_mutex, 1);

}

if (tid_in_block == 0) 

  ad[blockIdx.y]=(blockIdx.y);

while (g_mutex < goalVal) {

  u = 0;

}

__syncthreads();

}

global void boucleGPU(int *ad, int *cd){

  __gpu_sync(N*N, threadIdx.x,ad);  

  cd[blockIdx.y] = g_mutex;

}

// fonction de calcul sur GPU

host int GPU_calc(){

int *a,*c,*cd,*ad;

const int size = N*sizeof(int);

cudaMalloc((void**)&cd, size);

cudaMalloc((void **)&ad,size);

a = (int *) malloc(size);

c = (int *) malloc(size);

// on copie les donnÃ©es sur le GPU

dim3 dimB(1,1);

dim3 dimG(1, N);

boucleGPU<<<dimG, dimB>>>(ad,cd);

cudaThreadSynchronize();

cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);

cudaMemcpy(a, ad, size, cudaMemcpyDeviceToHost);

// affichage

for (int i = 0; i < N; i++){

  printf("a[%d]=%d, c[%d]=%d ", i,a[i],i,c[i]);

}

printf("\n");

// liberation de la mÃ©moire

cudaFree(cd);

cudaFree(ad);

free(a);

free©;

return EXIT_SUCCESS;

}

int main(){

GPU_calc();

return 0;

}

[/codebox]

andradx · March 17, 2010, 12:34pm

Has anyone tried the gpu_sync functions described in Inter-Block GPU Communication via Fast Barrier Synchronization, Shucai Xiao and Wu-chun Feng, Department of Computer Science, Virginia Tech? The code described in this article seems perfectly alright to me, but in the code below, __gpu_sync() does not stop the threads as it should do. I must be missing something, easily, but help!!.. :"> Thanks!

[codebox]include <cutil.h>

const int N = 8; // longueur de la chaine

// the mutex variable

device int g_mutex = 0;

device void __gpu_sync(int goalVal, int tid_in_block, int *ad){
int u;

if (tid_in_block == 0) {

    atomicAdd(&g_mutex, 1);

}

if (tid_in_block == 0) 

  ad[blockIdx.y]=(blockIdx.y);

while (g_mutex < goalVal) {

  u = 0;

}

__syncthreads();
}

global void boucleGPU(int *ad, int *cd){
  __gpu_sync(N*N, threadIdx.x,ad);  

  cd[blockIdx.y] = g_mutex;
}

// fonction de calcul sur GPU

host int GPU_calc(){
int *a,*c,*cd,*ad;

const int size = N*sizeof(int);

cudaMalloc((void**)&cd, size);

cudaMalloc((void **)&ad,size);

a = (int *) malloc(size);

c = (int *) malloc(size);
// on copie les donnÃ©es sur le GPU
dim3 dimB(1,1);

dim3 dimG(1, N);
boucleGPU<<<dimG, dimB>>>(ad,cd);
cudaThreadSynchronize();
cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);
cudaMemcpy(a, ad, size, cudaMemcpyDeviceToHost);
// affichage
for (int i = 0; i < N; i++){

  printf("a[%d]=%d, c[%d]=%d ", i,a[i],i,c[i]);

}

printf("\n");
// liberation de la mÃ©moire
cudaFree(cd);

cudaFree(ad);

free(a);

freeÂ©;

return EXIT_SUCCESS;
}

int main(){

GPU_calc();

return 0;

}

[/codebox]

There’s no need in using a tid_in_block as argument to __gpy_sync as threadIdx.x identifies it automatically, in your case with a 1D grid threadIdx.x is enough. Second of all the goalVal would be N and not NxN as you have defined N blocks on your grid, n’est ce pas? Just throwing out here, I came across this function and was just studying it as I replied.

rama365 · December 7, 2012, 10:19pm

Also, don’t forget to reset your g_mutex back to 0.

I am trying to get this working also. My result is GPU freeze, which I suspect is due to g_mutex < goalVal never stopping.