Has anyone tried the gpu_sync functions described in Inter-Block GPU Communication via Fast Barrier Synchronization, Shucai Xiao and Wu-chun Feng, Department of Computer Science, Virginia Tech? The code described in this article seems perfectly alright to me, but in the code below, __gpu_sync() does not stop the threads as it should do. I must be missing something, easily, but help!!.. :"> Thanks!
[codebox]#include <cutil.h>
const int N = 8; // longueur de la chaine
// the mutex variable
device int g_mutex = 0;
device void __gpu_sync(int goalVal, int tid_in_block, int *ad){
int u;
if (tid_in_block == 0) {
atomicAdd(&g_mutex, 1);
}
if (tid_in_block == 0)
ad[blockIdx.y]=(blockIdx.y);
while (g_mutex < goalVal) {
u = 0;
}
__syncthreads();
}
global void boucleGPU(int *ad, int *cd){
__gpu_sync(N*N, threadIdx.x,ad);
cd[blockIdx.y] = g_mutex;
}
// fonction de calcul sur GPU
host int GPU_calc(){
int *a,*c,*cd,*ad;
const int size = N*sizeof(int);
cudaMalloc((void**)&cd, size);
cudaMalloc((void **)&ad,size);
a = (int *) malloc(size);
c = (int *) malloc(size);
// on copie les données sur le GPU
dim3 dimB(1,1);
dim3 dimG(1, N);
boucleGPU<<<dimG, dimB>>>(ad,cd);
cudaThreadSynchronize();
cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);
cudaMemcpy(a, ad, size, cudaMemcpyDeviceToHost);
// affichage
for (int i = 0; i < N; i++){
printf("a[%d]=%d, c[%d]=%d ", i,a[i],i,c[i]);
}
printf("\n");
// liberation de la mémoire
cudaFree(cd);
cudaFree(ad);
free(a);
free©;
return EXIT_SUCCESS;
}
int main(){
GPU_calc();
return 0;
}
[/codebox]