Hi @MatColgrove, I have created two GPU kernels, gpu_kernel1 and gpu_kernel2, and I have also offloaded a function, stream_demux1, to the GPU asynchronously using OpenACC.
I get the following warnings when compiling the code with nvc++ (nvc++ -acc -cuda cuda_openacc.cu):
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
"cuda_openacc.cu", line 262: warning: CUDA C++ compilation is unsupported in nvc++; nvcc is the recommended CUDA compiler [cuda_compile]
gpu_kernel1<<<1, 1, 0, stream1>>>(gpu_input_buffer_1, gpu_input_buffer_2, stream_demux_input_buffer);
^
Remark: individual warnings can be suppressed with "--diag_suppress <warning-number>"
"cuda_openacc.cu", line 263: warning: CUDA C++ compilation is unsupported in nvc++; nvcc is the recommended CUDA compiler [cuda_compile]
gpu_kernel2<<<1, 1, 0, stream2>>>(gpu_output_buffer_1, gpu_output_buffer_2, stream_demux_output_buffer);
^
"cuda_openacc.cu", line 279: warning: statement is unreachable [code_is_unreachable]
DeallocateGPUBuffers();
^
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
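These are reported as warnings rather than errors. Purely as an illustrative sketch (not part of my original file), a launch check like the following could be added after each <<<...>>> call to confirm the kernels still launch correctly under nvc++; cudaGetLastError and cudaGetErrorString are standard CUDA runtime calls:

// Hedged sketch: verify the launch succeeded despite the [cuda_compile] warning
gpu_kernel1<<<1, 1, 0, stream1>>>(gpu_input_buffer_1, gpu_input_buffer_2, stream_demux_input_buffer);
cudaError_t launch_err = cudaGetLastError(); // records any launch-time error
if (launch_err != cudaSuccess)
    printf("gpu_kernel1 launch failed: %s\r\n", cudaGetErrorString(launch_err));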
// Here is the example program in the cuda_openacc.cu file
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <stdio.h>
#include <unistd.h>
#include <zmq.h>
#include <cmath>
#include <string>
#include <cstring>
#include <iomanip>
#include <sys/mman.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <openacc.h>
#include <device_launch_parameters.h>
using namespace std;
/****************MACROS DEFINITIONS************************************/
#define DEBUG 0
#define BUFFER_SIZE 64*1024 // 64kBytes
#define UWord8 unsigned char
#define UWord16 unsigned short
#define UWord32 unsigned int
#define LOCALHOST_IP string("tcp://127.0.0.1:")
// Fixed parameters for Stream Demux Block
#define LENGTH1 24 // Length of the first output
#define LENGTH2 4 // Length of the second output
#define TOTAL_LENGTH (LENGTH1 + LENGTH2) // Total length per cycle
#define VECTOR_LENGTH 1 // Single items, no vectors
#define BUFFER_FILLED 1
#define BUFFER_EMPTY 2
#define GPU_CLOSE 0
/**********************************************************************/
/*****************GLOBAL VARIABLES*************************************/
UWord8* gpu_input_buffer_1;
UWord8* gpu_input_buffer_2;
UWord8* stream_demux_input_buffer;
UWord8* stream_demux_output_buffer;
UWord8* gpu_output_buffer_1;
UWord8* gpu_output_buffer_2;
//Unified Memory Shared Between CPU & GPU
__device__ __managed__ volatile unsigned int gpu_input_buffer_1_flag = 0;
__device__ __managed__ volatile unsigned int gpu_input_buffer_2_flag = 0;
__device__ __managed__ volatile unsigned int gpu_output_buffer_1_flag = 0;
__device__ __managed__ volatile unsigned int gpu_output_buffer_2_flag = 0;
__device__ __managed__ volatile unsigned int gpu_input_buffer_1_size = 0;
__device__ __managed__ volatile unsigned int gpu_input_buffer_2_size = 0;
__device__ __managed__ volatile unsigned int gpu_output_buffer_1_size = 0;
__device__ __managed__ volatile unsigned int gpu_output_buffer_2_size = 0;
//GPU Memory
__device__ volatile unsigned int strm_dmx_input_buffer_flag = 0;
__device__ volatile unsigned int strm_dmx_input_buffer_size = 0;
__device__ volatile unsigned int strm_dmx_output_buffer_flag = 0;
__device__ volatile unsigned int strm_dmx_output_buffer_size = 0;
/**********************************************************************/
__device__ void gpuSleep(unsigned int cycles)
{
unsigned int start = clock(); // Read the current clock cycle (GPU clock ~675 MHz)
while (clock() - start < cycles) {
// Busy-wait
}
}
void stream_demux1(UWord8* stream_demux_input_buffer, UWord8* stream_demux_output_buffer)
{
size_t num_cycles = 0;
UWord32 done = 0;
printf("OpenACC Stream Demux Function r\n");
#pragma acc data present(strm_dmx_input_buffer_flag, strm_dmx_output_buffer_flag)
#pragma acc data present(strm_dmx_input_buffer_size, strm_dmx_output_buffer_size)
while(1)
{
if(strm_dmx_input_buffer_flag == BUFFER_FILLED)
{
do
{
// usleep(10);
}while(strm_dmx_output_buffer_flag == BUFFER_FILLED);
if(strm_dmx_output_buffer_flag == BUFFER_EMPTY)
{
num_cycles = strm_dmx_input_buffer_size / TOTAL_LENGTH;
for (size_t i = 0; i < num_cycles; i++)
{
memcpy(stream_demux_output_buffer + i * LENGTH1, stream_demux_input_buffer + i * TOTAL_LENGTH, LENGTH1);
// memcpy(output2 + i * LENGTH2, input + i * TOTAL_LENGTH + LENGTH1, LENGTH2);
}
strm_dmx_output_buffer_size = num_cycles * LENGTH1;
strm_dmx_output_buffer_flag = BUFFER_FILLED;
strm_dmx_input_buffer_flag = BUFFER_EMPTY;
}
}
else if (strm_dmx_input_buffer_flag == GPU_CLOSE)
{
strm_dmx_output_buffer_flag = GPU_CLOSE;
printf("Closing Stream Demux Function \r\n");
break;
}
else
{
// usleep(10);
}
}
}
// CUDA kernel declaration (without returning anything)
__global__ void gpu_kernel1(UWord8* gpu_input_buffer_1, UWord8* gpu_input_buffer_2, UWord8* stream_demux_input_buffer)
{
printf("GPU Kernel 1: Waiting for Buffer \r\n");
while (true)
{
if(gpu_input_buffer_1_flag == BUFFER_FILLED)
{
do
{
gpuSleep(675);
}while(strm_dmx_input_buffer_flag != BUFFER_EMPTY);
memcpy(stream_demux_input_buffer, gpu_input_buffer_1, gpu_input_buffer_1_size);
strm_dmx_input_buffer_size = gpu_input_buffer_1_size;
strm_dmx_input_buffer_flag = BUFFER_FILLED;
gpu_input_buffer_1_flag = BUFFER_EMPTY;
}
else if(gpu_input_buffer_2_flag == BUFFER_FILLED)
{
do
{
gpuSleep(675);
}while(strm_dmx_input_buffer_flag != BUFFER_EMPTY);
memcpy(stream_demux_input_buffer, gpu_input_buffer_2, gpu_input_buffer_2_size);
strm_dmx_input_buffer_size = gpu_input_buffer_2_size;
strm_dmx_input_buffer_flag = BUFFER_FILLED;
gpu_input_buffer_2_flag = BUFFER_EMPTY;
}
else if(gpu_input_buffer_1_flag == GPU_CLOSE || gpu_input_buffer_2_flag == GPU_CLOSE)
{
printf("Closing GPU Kernel 1 Thread \r\n");
strm_dmx_input_buffer_flag = GPU_CLOSE;
break;
}
else
{
gpuSleep(337837); // Sleep for ~500 us (GPU clock frequency = 675 MHz)
}
}
}
// CUDA kernel declaration (without returning anything)
__global__ void gpu_kernel2(UWord8* gpu_output_buffer_1, UWord8* gpu_output_buffer_2, UWord8* stream_demux_output_buffer)
{
printf("GPU Kernel 2: Waiting for Buffer \r\n");
while (true)
{
if(strm_dmx_output_buffer_flag == BUFFER_FILLED)
{
if(gpu_output_buffer_1_flag == BUFFER_EMPTY)
{
memcpy(gpu_output_buffer_1, stream_demux_output_buffer, strm_dmx_output_buffer_size);
gpu_output_buffer_1_size = strm_dmx_output_buffer_size;
gpu_output_buffer_1_flag = BUFFER_FILLED;
strm_dmx_output_buffer_flag = BUFFER_EMPTY;
}
else if(gpu_output_buffer_2_flag == BUFFER_EMPTY)
{
memcpy(gpu_output_buffer_2, stream_demux_output_buffer, strm_dmx_output_buffer_size);
gpu_output_buffer_2_size = strm_dmx_output_buffer_size;
gpu_output_buffer_2_flag = BUFFER_FILLED;
strm_dmx_output_buffer_flag = BUFFER_EMPTY;
}
else
{
gpuSleep(675);
}
}
else if(strm_dmx_output_buffer_flag == GPU_CLOSE)
{
gpu_output_buffer_1_flag = GPU_CLOSE;
gpu_output_buffer_2_flag = GPU_CLOSE;
printf("Closing GPU Kernel 2 Thread \r\n");
break;
}
else
{
gpuSleep(675);
}
}
}
void allocateGPUBuffers()
{
cudaMalloc((void **)&gpu_input_buffer_1, BUFFER_SIZE * sizeof(char));
cudaMalloc((void **)&gpu_input_buffer_2, BUFFER_SIZE * sizeof(char));
cudaMalloc((void **)&stream_demux_input_buffer, BUFFER_SIZE * sizeof(char));
cudaMalloc((void **)&stream_demux_output_buffer, BUFFER_SIZE * sizeof(char));
cudaMalloc((void **)&gpu_output_buffer_1, BUFFER_SIZE * sizeof(char));
cudaMalloc((void **)&gpu_output_buffer_2, BUFFER_SIZE * sizeof(char));
}
void DeallocateGPUBuffers()
{
cudaFree(gpu_input_buffer_1);
cudaFree(gpu_input_buffer_2);
cudaFree(stream_demux_input_buffer);
cudaFree(stream_demux_output_buffer);
cudaFree(gpu_output_buffer_1);
cudaFree(gpu_output_buffer_2);
}
void gpu_input_process()
{
cudaStream_t stream1, stream2;
// Allocate GPU Buffers
allocateGPUBuffers();
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cout<<"Host Side: Before GPU Kernels " << endl;
gpu_kernel1<<<1, 1, 0, stream1>>>(gpu_input_buffer_1, gpu_input_buffer_2, stream_demux_input_buffer);
gpu_kernel2<<<1, 1, 0, stream2>>>(gpu_output_buffer_1, gpu_output_buffer_2, stream_demux_output_buffer);
cout<<"Host Side: Before Stream Demux OpenACC Async" << endl;
#pragma acc data deviceptr(stream_demux_input_buffer, stream_demux_output_buffer)
#pragma acc kernels async(1)
{
stream_demux1(stream_demux_input_buffer, stream_demux_output_buffer);
}
cout<<"Host Side: After Stream Demux OpenACC Async" << endl;
while(1)
{
cout<<"CPU Function Running" << endl;
sleep(1);
// Logic FOR CPU
}
DeallocateGPUBuffers();
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
}
int main(int argc, char **argv)
{
gpu_input_process();
return 0;
}
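For reference on the [code_is_unreachable] warning: the while(1) loop in gpu_input_process() never terminates, so the DeallocateGPUBuffers() call after it can never execute. The lines below are only a hypothetical sketch of how that loop could be given an exit path so the cleanup becomes reachable; keep_running is an invented placeholder condition, and I have not verified concurrent host writes to the managed flags on my device:

// Hypothetical sketch only, replacing the while(1) loop in gpu_input_process()
bool keep_running = true;
while (keep_running)
{
    cout << "CPU Function Running" << endl;
    sleep(1);
    // CPU logic sets keep_running = false when finished
}
// Signal the device-side loops to shut down before freeing their buffers
gpu_input_buffer_1_flag = GPU_CLOSE;
gpu_input_buffer_2_flag = GPU_CLOSE;
#pragma acc wait(1)          // wait for the async OpenACC region (async(1)) to finish
cudaDeviceSynchronize();     // wait for gpu_kernel1 and gpu_kernel2 to exit
DeallocateGPUBuffers();
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);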