These are my computer's settings:
RTX3070
When I run FFmpeg's h264_nvenc encoder and OpenCL at the same time, the program blocks. This only happens when it is built as a 32-bit application; the 64-bit build works fine.
However, the same 32-bit application runs normally on some other computers, for example one with the following GPU:
RTX2060 with Max-Q design 30.0.14.9729
The OpenCL thread blocks inside clCreateContext.
#include<cstdio>
#include <cstdlib>
#include <cstring>
#include<iostream>
#include <string>
#include <thread>
#include <vector>
#include <CL/cl.h>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/opt.h>
#include <libavutil/imgutils.h>
#include <libavutil/avutil.h>
#include <libavutil/error.h>
#include <libavutil/frame.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
#include <libavutil/display.h>
#include <libavutil/mastering_display_metadata.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavcodec/avcodec.h>
#include <libavcodec/bsf.h>
}
#pragma warning( disable : 4996 )
#define KERNEL(...) #__VA_ARGS__
std::string getPlatformName(const cl_platform_id pid) {
size_t param_value_size;
clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, ¶m_value_size);
char param_value[2048];
clGetPlatformInfo(pid, CL_PLATFORM_NAME, param_value_size, param_value, NULL);
std::string strdevicetype(param_value);
return strdevicetype;
}
/**
 * Picks the OpenCL platform to run the test on.
 *
 * When more than one platform is installed, the last one whose name contains
 * "NVIDIA" is chosen; otherwise (or when none matches) the first platform is
 * used. The number of platforms and the chosen index are printed to stdout.
 *
 * @return The selected platform, or NULL when the platform list cannot be
 *         obtained or is empty.
 */
cl_platform_id getPlatFormIdx()
{
    cl_uint platformNum = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &platformNum);
    std::cout << "platformNum = " << platformNum << std::endl;
    if (status != CL_SUCCESS || platformNum == 0) {
        // Nothing to index into: bail out instead of reading an empty list.
        std::cout << "clGetPlatformIDs found no usable platform, status = " << status << std::endl;
        return NULL;
    }

    std::vector<cl_platform_id> platforms(platformNum);
    status = clGetPlatformIDs(platformNum, platforms.data(), NULL);
    if (status != CL_SUCCESS) {
        std::cout << "clGetPlatformIDs failed, status = " << status << std::endl;
        return NULL;
    }

    cl_uint independenrGPUIdx = 0;
    if (platformNum > 1) {
        const std::string nvidia_string = "NVIDIA";
        for (cl_uint n_plat = 0; n_plat < platformNum; ++n_plat) {
            const std::string plat_name = getPlatformName(platforms[n_plat]);
            if (plat_name.find(nvidia_string) != std::string::npos) {
                // No break: the last matching platform wins.
                independenrGPUIdx = n_plat;
            }
        }
    }

    std::cout << "NVIDIA independenrGPUIdx = " << independenrGPUIdx << std::endl;
    return platforms[independenrGPUIdx];
}
// OpenCL C source of the test kernel, stringified by the KERNEL macro.
//
// ocl_kernel_test copies an image of 4-byte pixels from imgSrc to imgDst.
// Work-item (dx, dy) handles the pixel in column dx of row dy; work-items
// outside the cols x rows rectangle do nothing.
//
// Pixels are addressed row-major, so the stride between consecutive rows is
// the number of columns (cols), not the number of rows. Using rows as the
// stride addresses memory past the end of a cols * rows * 4 byte buffer
// whenever rows > cols.
std::string programString = KERNEL(
__kernel void ocl_kernel_test(__global const unsigned char* imgSrc, int rows, int cols, __global unsigned char* imgDst)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);
    if (dx < cols && dy < rows)
    {
        int idx = dy * cols + dx;
        uchar4 rgba = vload4(0, imgSrc + 4 * idx);
        vstore4(rgba, 0, imgDst + 4 * idx);
    }
}
);
void opencl_test()
{
cl_int status = 0;
cl_uint platformnum = 0;
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue command_queue;
cl_program program;
std::cout << "into opencl_test" << std::endl;
platform = getPlatFormIdx();
cl_uint deviceNum = 0;
std::cout << "00start clGetDeviceIDs" << std::endl;
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceNum);
if (deviceNum < 1) {
std::cout << "Device Num < 1 !"<< std::endl;
return ;
}
std::cout << "00end clGetDeviceIDs" << std::endl;
std::cout << "11start clGetDeviceIDs" << std::endl;
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
std::cout << "11end clGetDeviceIDs" << std::endl;
int index = 0;
cl_context_properties context_properties[11] = { 0 };
context_properties[index++] = CL_CONTEXT_PLATFORM;
context_properties[index++] = (cl_context_properties)platform;
std::cout << "22start clCreateContext" << std::endl;
//block here
context = clCreateContext(context_properties, 1, &device,
[](const char* errinfo, const void* private_info, size_t cb, void* user_data) {
std::cout << errinfo << std::endl;
}, NULL, &status);
std::cout << "22end clCreateContext" << std::endl;
std::cout << "33start clCreateCommandQueue" << std::endl;
command_queue = clCreateCommandQueue(context, device, 0, &status);
std::cout << "33end clCreateCommandQueue" << std::endl;
const char* programSource = programString.c_str();
size_t programSize = programString.length();
program = clCreateProgramWithSource(context, 1, (const char**)&programSource, &programSize, &status);
std::string option = "-DT=float -DT4=float4 -DT8=float8 -DREAD_IMAGET=read_imagef -DWRITE_IMAGET=write_imagef -DCONVERT_T4=convert_float4";
status = clBuildProgram(program, 1, &device, option.c_str(), NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "ocl_kernel_test", &status);
const int width = 480;
const int height = 720;
unsigned char* src_cpu = (unsigned char*)malloc(width * height * 4);
unsigned char* dst_cpu = (unsigned char*)malloc(width * height * 4);
memset(dst_cpu, 0, width * height * 4);
for (int i = 0; i < width * height; i++) {
src_cpu[4 * i] = 86;
src_cpu[4 * i + 1] = 128;
src_cpu[4 * i + 2] = 64;
src_cpu[4 * i + 3] = 255;
}
cl_mem cl_input = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * 4 * sizeof(unsigned char), NULL, &status);
cl_mem cl_output = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * 4 * sizeof(unsigned char), NULL, &status);
for (int i = 0; i < 10; i++) {
clEnqueueWriteBuffer(command_queue, cl_input, CL_TRUE, 0, width * height * 4, src_cpu, 0, nullptr, nullptr);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_input);
status = clSetKernelArg(kernel, 1, sizeof(int), &height);
status = clSetKernelArg(kernel, 2, sizeof(int), &width);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_output);
size_t localSize[2] = { size_t(16),size_t(16) };
size_t globalSize[2] = { size_t((width)),size_t((height)) };
status = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
clFinish(command_queue);
clEnqueueReadBuffer(command_queue, cl_output, CL_TRUE, 0, width * height * 4, dst_cpu, 0, nullptr, nullptr);
}
for (int i = 1024; i < 1028; i++) {
std::cout << "i = " << i << " dst_val = " << (int)dst_cpu[i] << std::endl;
}
free(src_cpu);
free(dst_cpu);
clReleaseMemObject(cl_input);
clReleaseMemObject(cl_output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
std::cout << "end opencl_test" << std::endl;
}
/**
 * Feeds one frame to the encoder and writes every packet that becomes
 * available to outfile.
 *
 * @param enc_ctx Encoder context the frame is sent to.
 * @param frame   Frame to encode; the caller passes NULL to flush the encoder.
 * @param pkt     Scratch packet; unreferenced after each write.
 * @param outfile Destination the packet payload is written to as-is.
 *
 * Terminates the process with exit(1) when sending or receiving fails.
 */
static void encode(AVCodecContext* enc_ctx, AVFrame* frame, AVPacket* pkt,
    FILE* outfile)
{
    std::cout << "FFMpeg encode" << std::endl;

    /* hand the frame over to the encoder */
    if (avcodec_send_frame(enc_ctx, frame) < 0) {
        fprintf(stderr, "Error sending a frame for encoding\n");
        exit(1);
    }

    /* pull packets until the encoder asks for more input or is finished */
    for (;;) {
        std::cout << "avcodec_receive_packet" << std::endl;
        const int rc = avcodec_receive_packet(enc_ctx, pkt);

        const bool drained = (rc == AVERROR(EAGAIN)) || (rc == AVERROR_EOF);
        if (drained) {
            return;
        }
        if (rc < 0) {
            fprintf(stderr, "Error during encoding\n");
            exit(1);
        }

        fwrite(pkt->data, 1, pkt->size, outfile);
        av_packet_unref(pkt);
    }
}
void ffmpeng_encode_test()
{
const char* filename, * codec_name;
const AVCodec* codec;
AVCodecContext* c = NULL;
int i, ret, x, y;
FILE* f;
AVFrame* frame;
AVPacket* pkt;
uint8_t endcode[] = { 0, 0, 1, 0xb7 };
filename = "nvenc_video.mp4";//argv[1];
codec_name = "h264_nvenc";//argv[2];
/* find the mpeg1video encoder */
std::cout << "avcodec_find_encoder_by_name" << std::endl;
codec = avcodec_find_encoder_by_name(codec_name);
if (!codec) {
fprintf(stderr, "Codec '%s' not found\n", codec_name);
exit(1);
}
std::cout << "avcodec_alloc_context3" << std::endl;
c = avcodec_alloc_context3(codec);
if (!c) {
fprintf(stderr, "Could not allocate video codec context\n");
exit(1);
}
std::cout << "av_packet_alloc" << std::endl;
pkt = av_packet_alloc();
if (!pkt)
exit(1);
/* put sample parameters */
c->bit_rate = 400000;
/* resolution must be a multiple of two */
c->width = 640;
c->height = 960;
/* frames per second */
AVRational base;
base.num = 1;
base.den = 25;
c->time_base = base;
AVRational framerate;
framerate.num = 25;
framerate.den = 1;
c->framerate = framerate;
c->gop_size = 10;
c->max_b_frames = 1;
c->pix_fmt = AV_PIX_FMT_YUV420P;
if (codec->id == AV_CODEC_ID_H264)
av_opt_set(c->priv_data, "preset", "slow", 0);
/* open it */
std::cout << "avcodec_open2" << std::endl;
ret = avcodec_open2(c, codec, NULL);
if (ret < 0) {
//printf(stderr, "Could not open codec: %s\n", av_err2str(ret));
printf("Could not open codec\n");
exit(1);
}
std::cout << " f = fopen" << std::endl;
f = fopen(filename, "wb");
if (!f) {
fprintf(stderr, "Could not open %s\n", filename);
exit(1);
}
std::cout << "av_frame_alloc" << std::endl;
frame = av_frame_alloc();
if (!frame) {
fprintf(stderr, "Could not allocate video frame\n");
exit(1);
}
frame->format = c->pix_fmt;
frame->width = c->width;
frame->height = c->height;
std::cout << "av_frame_get_buffer" << std::endl;
ret = av_frame_get_buffer(frame, 0);
if (ret < 0) {
fprintf(stderr, "Could not allocate the video frame data\n");
exit(1);
}
/* encode 1 second of video */
for (i = 0; i < 250; i++) {
printf("---encode i = %d \n", i);
fflush(stdout);
ret = av_frame_make_writable(frame);
if (ret < 0)
exit(1);
/* Y */
for (y = 0; y < c->height; y++) {
for (x = 0; x < c->width; x++) {
frame->data[0][y * frame->linesize[0] + x] = x + y + i * 3;
}
}
/* Cb and Cr */
for (y = 0; y < c->height / 2; y++) {
for (x = 0; x < c->width / 2; x++) {
frame->data[1][y * frame->linesize[1] + x] = 128 + y + i * 2;
frame->data[2][y * frame->linesize[2] + x] = 64 + x + i * 5;
}
}
frame->pts = i;
/* encode the image */
encode(c, frame, pkt, f);
}
/* flush the encoder */
encode(c, NULL, pkt, f);
if (codec->id == AV_CODEC_ID_MPEG1VIDEO || codec->id == AV_CODEC_ID_MPEG2VIDEO)
fwrite(endcode, 1, sizeof(endcode), f);
fclose(f);
avcodec_free_context(&c);
av_frame_free(&frame);
av_packet_free(&pkt);
}
#define RUN_NVENC_TEST 0
#define RUN_OCL_TEST 0
#define RUN_NVENC_OCL_TEST 1
/**
 * Entry point of the repro program.
 *
 * Prints a banner and the FFmpeg version, then runs whichever scenarios are
 * enabled by the RUN_* macros: the encoder alone, OpenCL alone, or both
 * concurrently on two threads that are joined before the closing banner.
 */
int main()
{
    std::cout << "*****start nvenc opencl test*****" << std::endl;
    std::cout << av_version_info() << std::endl;

#if RUN_NVENC_TEST
    // Encoder only, on the main thread.
    ffmpeng_encode_test();
#endif

#if RUN_OCL_TEST
    // OpenCL only, on the main thread.
    opencl_test();
#endif

#if RUN_NVENC_OCL_TEST
    // Both at once: the encoder thread is started first, then the OpenCL one.
    std::thread encoder_worker(ffmpeng_encode_test);
    std::thread opencl_worker(opencl_test);
    encoder_worker.join();
    opencl_worker.join();
#endif

    std::cout << "*****end nvenc opencl test*****" << std::endl;
    return 0;
}