Hi,
The following code is my attempt to use a 2-element array F (stands for flag) in global memory to synchronize read and write from 2 blocks, and measure the time spent in transfer a 16KB shared memory in block 0 to global memory, and then read by another block, followed by a new writing.
I have it executed as a grid of 2 blocks and each block contains only 1 thread. I think the two blocks are allocated in 2 different multiprocessors, thus the shared memory array S are two different array (each S is 16KB). Because CUDA doesn’t support inter-block synchronization, I use flag F in the global memory to do the trick.
My question is:
It doesn’t work. The printout of array G(stands for global memory array) is always strange numbers.
Not the 2i or 2i+1 that I assigned to G in block 0 or block 1.
Could somebody help me figure out where is the bug in the program? I have spent days finding it without any luck.
Many thanks
Timtimac
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cutil.h>
#define SIZE 4000
__global__ void roundTrip(float *G, int *F) {
int bx=blockIdx.x;
int i;
__shared__ float S; // 16KB for each block
if (bx==0) {
// block 0: write data from __shared__ memory S to __global__ memory G
for (i=0;i<SIZE;i++) {
S[i] = i*2+1;
G[i] = S[i];
}
// set flag F[0] to 1, meaning the writing is finished
F[0] = 1;
// waiting for flag F[1] to become 1, meaning it is ready to read
while (F[1]==0)
;
// block 0: read data (written by block 1) from global memory to shared memory
for (i=0;i<SIZE;i++)
S[i] = G[i];
} else if (bx==1) {
// block 1: wait for block 0 to finish writing
while (F[0]==0)
;
for (i=0;i<SIZE;i++) {
// read what has been written by block 0
S[i] = G[i];
// write new data to global memory
G[i] = i*2;
}
// set flag F[1] to 1, signal the writing is completed
F[1] =1;
}
}
int main(void) {
float *GD;
float G;
int *F;
int i;
int *FD;
unsigned int timer;
cudaMalloc((void**)&GD, sizeof(float)*SIZE);
cudaMemcpy(GD, G, sizeof(float)*SIZE, cudaMemcpyHostToDevice);
cudaMalloc((void**)&FD, sizeof(int)*2);
F = (int *)malloc(sizeof(int)*2);
F[0] = 0;
F[1] = 0;
cudaMemcpy(FD,F,sizeof(int)*2,cudaMemcpyHostToDevice);
dim3 dimBlock(1,1);
dim3 dimGrid(2,1);
CUT_SAFE_CALL(cutCreateTimer(&timer));
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStartTimer(timer));
roundTrip<<<dimGrid,dimBlock>>>(GD, FD);
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(timer));
printf("Time (in ms): %f\n", cutGetTimerValue(timer));
cudaMemcpy(G,GD,sizeof(float)*SIZE,cudaMemcpyDeviceToHost);
for (i=0; i <SIZE; i++){
printf("%.1f ", G[i]);
if (i%15==0) printf("\n");
}
cudaFree(GD);
cudaFree(FD);
return 0;
}