Cuda kernel have different results between host machine and docker container

Bobo-l · December 25, 2023, 8:02am

I write a cuda kernel for resize and normlize image, the kernel return right results on host machine, but it return all zero when i compile and run it in docker image.

kernel.cu

__global__ void rgb_crop_norm_kernel(const uint8_t *src, float *dst, float fx_scale, float fy_scale, float fx_offset,
                                     float fy_offset, int src_width, int src_height, int dst_width, int dst_height,
                                     float mean_r, float mean_g, float mean_b, float scale_r, float scale_g,
                                     float scale_b) {
  const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
  const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;

  if (dst_x >= dst_width || dst_y >= dst_height) {
    return;
  }

  const int src_size = src_width * src_height;
  float src_x = dst_x * fx_scale + fx_offset;
  float src_y = dst_y * fy_scale + fy_offset;
  const int x1 = max(__float2int_rd(src_x), 0);
  const int y1 = max(__float2int_rd(src_y), 0);
  const int x1_read = x1;
  const int y1_read = y1;
  const int x2 = x1 + 1;
  const int y2 = y1 + 1;
  const int x2_read = min(x2, src_width - 1);
  const int y2_read = min(y2, src_height - 1);

  int idx11 = (y1_read * src_width + x1_read);
  int idx12 = (y1_read * src_width + x2_read);
  int idx21 = (y2_read * src_width + x1_read);
  int idx22 = (y2_read * src_width + x2_read);
  float weight11 = (x2 - src_x) * (y2 - src_y);
  float weight12 = (src_x - x1) * (y2 - src_y);
  float weight21 = (x2 - src_x) * (src_y - y1);
  float weight22 = (src_x - x1) * (src_y - y1);

  uchar3 src11 = make_uchar3(src[idx11], src[idx11 + src_size], src[idx11 + src_size * 2]);
  uchar3 src12 = make_uchar3(src[idx12], src[idx12 + src_size], src[idx12 + src_size * 2]);
  uchar3 src21 = make_uchar3(src[idx21], src[idx21 + src_size], src[idx21 + src_size * 2]);
  uchar3 src22 = make_uchar3(src[idx22], src[idx22 + src_size], src[idx22 + src_size * 2]);
  float3 out;
  out.x = src11.x * weight11 + src12.x * weight12 + src21.x * weight21 + src22.x * weight22;
  out.y = src11.y * weight11 + src12.y * weight12 + src21.y * weight21 + src22.y * weight22;
  out.z = src11.z * weight11 + src12.z * weight12 + src21.z * weight21 + src22.z * weight22;

  float out_r = (clampF(out.x, 0.0f, 255.0f) - mean_r) / scale_r;
  float out_g = (clampF(out.y, 0.0f, 255.0f) - mean_g) / scale_g;
  float out_b = (clampF(out.z, 0.0f, 255.0f) - mean_b) / scale_b;

  const int dst_idx = dst_y * dst_width + dst_x;
  dst[dst_idx] = out_r;
  dst[dst_idx + dst_width * dst_height] = out_g;
  dst[dst_idx + dst_width * dst_height * 2] = out_b;
}

void RGB_CropNorm(const uint8_t *src, float *dst, const RectI &roi, const Shape2DI &src_shape, const Shape2DI &dst_shape,
                  const Point3DF &mean, const Point3DF &scale, cudaStream_t stream) {
  dim3 block(32, 32);
  dim3 grid(divup(dst_shape.width, block.x), divup(dst_shape.height, block.y));
  float fx_scale = 1.0f * roi.GetWidth() / dst_shape.width;
  float fy_scale = 1.0f * roi.GetHeight() / dst_shape.height;
  float fx_offset = roi.l + 0.5f * fx_scale - 0.5f;
  float fy_offset = roi.t + 0.5f * fy_scale - 0.5f;
  rgb_crop_norm_kernel<<<grid, block, 0, stream>>>(src, dst, fx_scale, fy_scale, fx_offset, fy_offset, src_shape.width,
                                                   src_shape.height, dst_shape.width, dst_shape.height, mean.x, mean.y,
                                                   mean.z, scale.x, scale.y, scale.z);
}

main.cpp

#include <stdio.h>
#include <math.h>
#include <cuda_runtime_api.h>
#include "kernel/norm.h"
#include <opencv2/opencv.hpp>


cv::Mat hwc2chw(cv::Mat &image){
    std::vector<cv::Mat> rgb_images;
    cv::split(image, rgb_images);
    cv::Mat m_flat_r = rgb_images[0].reshape(1,1);
    cv::Mat m_flat_g = rgb_images[1].reshape(1,1);
    cv::Mat m_flat_b = rgb_images[2].reshape(1,1);
    cv::Mat matArray[] = { m_flat_r, m_flat_g, m_flat_b};
    cv::Mat flat_image;
    cv::hconcat(matArray, 3, flat_image);
    return flat_image;
}

int main(void){
    cv::Mat img = cv::imread("test.jpg");
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    cv::Mat chw;
    chw = hwc2chw(img);
    size_t src_mem_size = sizeof(uchar) * img.cols * img.rows * img.channels();
    int dst_size = 480 * 270;
    int dst_h = 270;
    int dst_w = 480;
    int dst_channel = 3;
    size_t dst_mem_size = sizeof(float) * dst_h * dst_w * dst_channel;
    float *norm_data = (float*)malloc(dst_mem_size);
    uchar *img_cuda;
    float *norm_data_cuda;
    cudaMalloc((void **)&img_cuda, src_mem_size);
    cudaMalloc((void **)&norm_data_cuda, dst_mem_size);
    cudaMemcpy(img_cuda, chw.data, src_mem_size, cudaMemcpyHostToDevice);
    RGB_CropNorm(img_cuda, norm_data_cuda,  RectI{0, 0, 3840, 2160}, Shape2DI{3840, 2160}, Shape2DI{480, 270}, Point3DF{0.0, 0.0, 0.0}, Point3DF{1.0, 1.0, 1.0}, nullptr);
    cudaMemcpy(norm_data, norm_data_cuda, dst_mem_size, cudaMemcpyDeviceToHost);
    std::vector<float> r(norm_data, norm_data + dst_size);
    std::vector<float> g(norm_data + dst_size, norm_data + dst_size * 2);
    std::vector<float> b(norm_data + dst_size *2, norm_data + dst_size * 3);
    for(size_t i =0; i < 100; i++){
        printf("%f , %f, %f",r[i], g[i], b[i]);
    }
    cv::Mat matArray[] = {cv::Mat(dst_h, dst_w, CV_32FC1, b.data()), cv::Mat(dst_h, dst_w, CV_32FC1, g.data()), cv::Mat(dst_h, dst_w, CV_32FC1, r.data())};
    cv::Mat merge_image;
    cv::merge(matArray, 3, merge_image);
    cv::imwrite("img.jpg", merge_image);
    free(norm_data);
    cudaFree(img_cuda);
    cudaFree(norm_data_cuda);
    return 0;
}

host
host env:
ubuntu 20.04

nvidia-driver

compile and print some value

and the result image is correct.

In docker

I use two different docker, the kernel run in both of they return error result , all error.

docker one from nvidia triton-server:

FROM nvcr.io/nvidia/tritonserver:23.08-py3

docker two from nvidia tensorrt-llm

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=23.10-py3:

as above , the kernel result is all 0.

Robert_Crovella · December 25, 2023, 3:35pm

I generally suggest that when people are having difficulty with a CUDA code, that they first employ proper CUDA error checking.

If it were me, and my base machine had a CUDA 12.0 capable driver, I personally would not try to use a docker container that had a newer CUDA version in it.

Bobo-l · December 26, 2023, 3:00am

thanks, I use a new docker with cuda 11.4, return is right

system · January 9, 2024, 3:01am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
cudaMemcpy() returns success and copy incorrect data CUDA Programming and Performance	3	2134	March 4, 2017
Concurrent Kernel executions & Data Transfers CUDA Programming and Performance cuda	3	597	March 8, 2023
Need Help. CUDA kernel fails randomly CUDA Programming and Performance cuda , kernel	3	506	July 27, 2022
Kernel Convolution with streams provides no benefit CUDA Programming and Performance	4	34	January 20, 2025
Re_arranging Cuda array CUDA Programming and Performance	8	40	September 23, 2024
The kernel isn't working CUDA Programming and Performance	9	1050	January 19, 2011
what's wrong with my code? CUDA Programming and Performance	6	3955	July 7, 2009
Cuda 11.4.2 docker image driver version mismatch CUDA Setup and Installation	2	4229	January 7, 2022
2D Convolution problem following example from SDK source code included CUDA Programming and Performance	9	11642	June 7, 2011
CUDA code giving wrong result CUDA Programming and Performance	0	399	May 4, 2020

Cuda kernel have different results between host machine and docker container

Related topics