Briefly:
I want to iterate
new[i][j]=(old[i+1][j]+old[i-1][j]+old[i][j+1]+old[i][j-1]-edge[i][j])/4
I did it using global memory (not worrying about coalescing or anything), but it is not much faster than a serial CPU version. Obtaining maximum performance is my goal.
How would you attempt this?
I am considering using texture memory to improve performance. Would this be a good choice?
Thank you!
If you want to read it, here is a fuller description and code snippet:
I am writing a program to do the following:
1. I have a 2D (N x M) array 'buf_h'.
2. I copy this to 'buf' on the device.
3. I allocate three (N+2) x (M+2) blocks of memory on the device, called 'old', 'newa', and 'edge'.
4. 'edge' and 'old' start out as 'buf' surrounded by one layer of zeros.
5. The following is iterated ITS times for 0<i<N+1 and 0<j<M+1 (here 'new' is the 'newa' array):
5a. new[i][j]=(old[i+1][j]+old[i-1][j]+old[i][j+1]+old[i][j-1]-edge[i][j])/4
5b. old[i][j]=new[i][j]
6. The interior of 'old' is copied back into 'buf', and 'buf' is copied to 'buf_h'.
...
int main(){
...
cudaMalloc((void **) &buf, sizeof(float)*N*M);
cudaMalloc((void **) &old, sizeof(float)*(N+2)*(M+2));
...
cudaMemcpy(buf, buf_h, sizeof(float)*N*M, cudaMemcpyHostToDevice);
cudaMemset(edge, 0, sizeof(float)*(N+2)*(M+2));
...
/* 'old' and 'newa' are used as ping-pong buffers below, so the one-cell
   border of 'newa' must be zero as well: after the first swap it is read
   as the "old" state, and the kernels only ever write interior cells. */
cudaMemset(newa, 0, sizeof(float)*(N+2)*(M+2));
/* Kernels launched on the same stream execute in issue order, so no
   host-side synchronisation is required between the launches below. */
load<<<grid, block>>>(buf, edge, old);
for(it=0;it<ITS;it++){
    /* Read the previous state from 'old', write the next state to 'newa'. */
    edgeker<<<grid, block>>>(old, edge, newa);
    /* Swap the roles of the two buffers instead of copying newa -> old on
       the device. After the swap 'old' always names the latest state. */
    float *tmp = old;
    old = newa;
    newa = tmp;
}
finish<<<grid, block>>>(buf, old);
/* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
   replacement. One sync here is enough to wait for all queued kernels. */
cudaDeviceSynchronize();
cudaMemcpy(buf_h, buf, sizeof(float)*N*M, cudaMemcpyDeviceToHost);
...
}

/*
 * load: copies the unpadded array 'buf' (row length M) into the interior of
 * the padded arrays 'edge' and 'old' (row length M+2).
 *
 * Each thread handles one element: threadIdx.x selects the column and
 * blockIdx.y selects the row.
 * NOTE(review): this presumably expects blockDim.x == M, gridDim.y == N and
 * gridDim.x == 1 -- confirm against the definition of 'grid' and 'block'.
 */
__global__ void load(float *buf, float *edge, float *old){
    int src = threadIdx.x + blockIdx.y*M;
    /* Same cell in the padded layout: skip one border row and one border
       column. Equals src + M+3 + 2*blockIdx.y. */
    int dst = (blockIdx.y + 1)*(M + 2) + threadIdx.x + 1;
    float v = buf[src];
    edge[dst] = v;
    old[dst] = v;
}

/*
 * edgeker: one relaxation step on the padded (row length M+2) arrays.
 *
 *   newa[i][j] = (old[i][j+1] + old[i][j-1] + old[i+1][j] + old[i-1][j]
 *                 - edge[i][j]) / 4
 *
 * 'old' and 'edge' are only read; 'newa' is only written and must not alias
 * 'old'. The kernel deliberately does NOT write the result back into 'old':
 * other threads and blocks read their neighbours from 'old' during the same
 * launch, so an in-place update would be a data race. The caller swaps the
 * 'old' and 'newa' pointers between launches instead.
 *
 * Thread mapping is the same as in load (threadIdx.x = column,
 * blockIdx.y = row of the unpadded array).
 */
__global__ void edgeker(const float * __restrict__ old,
                        const float * __restrict__ edge,
                        float * __restrict__ newa){
    const int stride = M + 2;
    const int c = (blockIdx.y + 1)*stride + threadIdx.x + 1;
    /* Multiplying by 0.25f gives the same result as dividing by 4. */
    newa[c] = (old[c + 1] + old[c - 1] + old[c + stride] + old[c - stride]
               - edge[c]) * 0.25f;
}
/*
 * finish: copies the interior of the padded array 'old' (row length M+2)
 * back into the unpadded array 'buf' (row length M).
 *
 * Each thread handles one element: threadIdx.x selects the column and
 * blockIdx.y selects the row of 'buf'.
 * NOTE(review): this presumably expects blockDim.x == M and gridDim.y == N --
 * confirm against the launch configuration.
 *
 * It is launched with <<<...>>>, so it must be a __global__ function.
 */
__global__ void finish(float *buf, const float * __restrict__ old){
    int dst = threadIdx.x + blockIdx.y*M;
    /* Matching cell in the padded layout; equals dst + M+3 + 2*blockIdx.y. */
    int src = (blockIdx.y + 1)*(M + 2) + threadIdx.x + 1;
    buf[dst] = old[src];
}