Hi
my main() function is not really important, only has some allocations for input and output buffers on Host. Below is the important stuff:
[b]my Kernel:
[/b]
global void addBias_kernel(float* output, float* input, float* bias)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < NUM_OF_SENSORS) && (y < NUM_OF_SLICES))
{
int i = y * NUM_OF_SENSORS + x;
output[i] = sinf(input[i]) + atan(bias[i]); // The sinf() and atan() is just for testing of kernel that takes longer
}
}
[font=“Consolas”][font=“Consolas”]
My CUDA function (important stuff colored in dark blue):
cudaError_t cudaBiasStreams(float* h_input, float* h_output, float* h_bias)
{
cudaError_t cudaStatus;
float *d_input, *d_output, *d_bias;
int i;
cudaStream_t cStream[NUM_OF_STREAMS];
const int singleViewSize = NUM_OF_SENSORS * NUM_OF_SLICES;
const int totalDataSize = singleViewSize * NUM_OF_VIEWS;[/font][/font]
[font=“Consolas”][font=“Consolas”] // Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”] for(i=0 ; i<NUM_OF_STREAMS ; i++)
{
cudaStatus = cudaStreamCreate(&cStream[i]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “cudaStreamCreate failed!”);
goto Error;
}
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaStatus = cudaMalloc((void**)&d_bias, singleViewSize * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaStatus = cudaMemcpy(d_bias, h_bias, singleViewSize * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaStatus = cudaMalloc((void**)&d_input, totalDataSize * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaStatus = cudaMalloc((void**)&d_output, totalDataSize * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”] for (i=0 ; i<NUM_OF_VIEWS ; i++)
{
cudaMemcpyAsync(&d_input[i * singleViewSize], &h_input[i * singleViewSize], singleViewSize * sizeof(float), cudaMemcpyHostToDevice, cStream[i % NUM_OF_STREAMS]);[/font][/font]
[font=“Consolas”][font=“Consolas”] dim3 threads(16, 16, 1);
dim3 blocks;
blocks.x = iDivUp(NUM_OF_SENSORS, threads.x);
blocks.y = iDivUp(NUM_OF_SLICES, threads.y);
blocks.z = 1;[/font][/font]
[font=“Consolas”][font=“Consolas”] addBias_kernel<<<blocks, threads, 0, cStream[i % NUM_OF_STREAMS]>>> (&d_output[i * singleViewSize], &d_input[i * singleViewSize], d_bias);[/font][/font]
[font=“Consolas”][font=“Consolas”]/* cudaMemcpyAsync(&h_output[i * singleViewSize], // WILL NOT ALOW PARALLEL WORK !!! becasue I do not have “Dual Copy Engine”
&d_output[i * singleViewSize],
singleViewSize * sizeof(float),
cudaMemcpyDeviceToHost,
cStream[i % NUM_OF_STREAMS]);
*/
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize failed!");
goto Error;
}[/font][/font]
[font=“Consolas”][font=“Consolas”]Error:
cudaFree(d_bias);
cudaFree(d_input);
cudaFree(d_output);[/font][/font]
[font=“Consolas”][font=“Consolas”] for(i=0 ; i<NUM_OF_STREAMS ; i++)
{
cudaStatus = cudaStreamDestroy(cStream[i]);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “cudaStreamDestroy failed!”);
goto Error;
}
}[/font][/font]
[font=“Consolas”][font=“Consolas”] cudaDeviceReset();[/font][/font]
[font=“Consolas”][font=“Consolas”] return cudaStatus;
}
[b]Additional suff:
[/b]
inline int iDivUp(int a, int b)
{
return (a % b != 0) ? (a / b + 1) : (a / b);
}
#define NUM_OF_SENSORS (688)
#define NUM_OF_SLICES (192*2)
#define NUM_OF_VIEWS (200)[/font][/font]
[font=“Consolas”][font=“Consolas”]#define NUM_OF_STREAMS (7)
[/font][/font]