I’m running the same function on every thread, but with different input parameters. To speed up this function, I declare the variables in the function as shared. Because shared memory is shared within a block, I worry that the function calls interfere with each other and the shared variables could be overwritten. However, the example below seems to work correctly. Is this actual legal use of shared variables? Does every call to that function reserve a separate copy of that variable or am I just lucky?
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
device void BGC_MODEL(float bgc_swr, float *output)
{
const int bgc_timesteps=20;
shared float topo4;
shared int it;
topo4 = 0.;
for(it=1;it<=bgc_timesteps;it++)
topo4 += 0.05*bgc_swr;
*output = topo4;
}
global
void wrapper(int nn, float *bgc_swr, float *output)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i>=0 && i<nn)
BGC_MODEL(bgc_swr[i],&output[i]);
}
host
int main() {
int i, blcks;
const int nn=1000;
const int threadsperblock=32;
size_t size = nn*sizeof(float);
float* bgc_swr = (float*)malloc(size);
float* output = (float*)malloc(size);
for (i=0;i<nn;i++) {
bgc_swr[i]=(float)i;
}
float* bgc_swr_DEV; cudaMalloc(&bgc_swr_DEV, size);
float* output_DEV; cudaMalloc(&output_DEV, size);
blcks = (nn+threadsperblock-1)/threadsperblock;
cudaMemcpy(bgc_swr_DEV,bgc_swr, size, cudaMemcpyHostToDevice);
wrapper<<<blcks, threadsperblock>>>(nn, bgc_swr_DEV, output_DEV);
cudaMemcpy(output, output_DEV, size, cudaMemcpyDeviceToHost);
for (i=0;i<nn;i+=100)
printf(“%d %f\n”,output[i]);
return 0;
}