Cuda kernel have different results between host machine and docker container

I write a cuda kernel for resize and normlize image, the kernel return right results on host machine, but it return all zero when i compile and run it in docker image.

kernel.cu

__global__ void rgb_crop_norm_kernel(const uint8_t *src, float *dst, float fx_scale, float fy_scale, float fx_offset,
                                     float fy_offset, int src_width, int src_height, int dst_width, int dst_height,
                                     float mean_r, float mean_g, float mean_b, float scale_r, float scale_g,
                                     float scale_b) {
  const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
  const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;

  if (dst_x >= dst_width || dst_y >= dst_height) {
    return;
  }

  const int src_size = src_width * src_height;
  float src_x = dst_x * fx_scale + fx_offset;
  float src_y = dst_y * fy_scale + fy_offset;
  const int x1 = max(__float2int_rd(src_x), 0);
  const int y1 = max(__float2int_rd(src_y), 0);
  const int x1_read = x1;
  const int y1_read = y1;
  const int x2 = x1 + 1;
  const int y2 = y1 + 1;
  const int x2_read = min(x2, src_width - 1);
  const int y2_read = min(y2, src_height - 1);

  int idx11 = (y1_read * src_width + x1_read);
  int idx12 = (y1_read * src_width + x2_read);
  int idx21 = (y2_read * src_width + x1_read);
  int idx22 = (y2_read * src_width + x2_read);
  float weight11 = (x2 - src_x) * (y2 - src_y);
  float weight12 = (src_x - x1) * (y2 - src_y);
  float weight21 = (x2 - src_x) * (src_y - y1);
  float weight22 = (src_x - x1) * (src_y - y1);

  uchar3 src11 = make_uchar3(src[idx11], src[idx11 + src_size], src[idx11 + src_size * 2]);
  uchar3 src12 = make_uchar3(src[idx12], src[idx12 + src_size], src[idx12 + src_size * 2]);
  uchar3 src21 = make_uchar3(src[idx21], src[idx21 + src_size], src[idx21 + src_size * 2]);
  uchar3 src22 = make_uchar3(src[idx22], src[idx22 + src_size], src[idx22 + src_size * 2]);
  float3 out;
  out.x = src11.x * weight11 + src12.x * weight12 + src21.x * weight21 + src22.x * weight22;
  out.y = src11.y * weight11 + src12.y * weight12 + src21.y * weight21 + src22.y * weight22;
  out.z = src11.z * weight11 + src12.z * weight12 + src21.z * weight21 + src22.z * weight22;

  float out_r = (clampF(out.x, 0.0f, 255.0f) - mean_r) / scale_r;
  float out_g = (clampF(out.y, 0.0f, 255.0f) - mean_g) / scale_g;
  float out_b = (clampF(out.z, 0.0f, 255.0f) - mean_b) / scale_b;

  const int dst_idx = dst_y * dst_width + dst_x;
  dst[dst_idx] = out_r;
  dst[dst_idx + dst_width * dst_height] = out_g;
  dst[dst_idx + dst_width * dst_height * 2] = out_b;
}

void RGB_CropNorm(const uint8_t *src, float *dst, const RectI &roi, const Shape2DI &src_shape, const Shape2DI &dst_shape,
                  const Point3DF &mean, const Point3DF &scale, cudaStream_t stream) {
  dim3 block(32, 32);
  dim3 grid(divup(dst_shape.width, block.x), divup(dst_shape.height, block.y));
  float fx_scale = 1.0f * roi.GetWidth() / dst_shape.width;
  float fy_scale = 1.0f * roi.GetHeight() / dst_shape.height;
  float fx_offset = roi.l + 0.5f * fx_scale - 0.5f;
  float fy_offset = roi.t + 0.5f * fy_scale - 0.5f;
  rgb_crop_norm_kernel<<<grid, block, 0, stream>>>(src, dst, fx_scale, fy_scale, fx_offset, fy_offset, src_shape.width,
                                                   src_shape.height, dst_shape.width, dst_shape.height, mean.x, mean.y,
                                                   mean.z, scale.x, scale.y, scale.z);
}

main.cpp

#include <stdio.h>
#include <math.h>
#include <cuda_runtime_api.h>
#include "kernel/norm.h"
#include <opencv2/opencv.hpp>


cv::Mat hwc2chw(cv::Mat &image){
    std::vector<cv::Mat> rgb_images;
    cv::split(image, rgb_images);
    cv::Mat m_flat_r = rgb_images[0].reshape(1,1);
    cv::Mat m_flat_g = rgb_images[1].reshape(1,1);
    cv::Mat m_flat_b = rgb_images[2].reshape(1,1);
    cv::Mat matArray[] = { m_flat_r, m_flat_g, m_flat_b};
    cv::Mat flat_image;
    cv::hconcat(matArray, 3, flat_image);
    return flat_image;
}

int main(void){
    cv::Mat img = cv::imread("test.jpg");
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    cv::Mat chw;
    chw = hwc2chw(img);
    size_t src_mem_size = sizeof(uchar) * img.cols * img.rows * img.channels();
    int dst_size = 480 * 270;
    int dst_h = 270;
    int dst_w = 480;
    int dst_channel = 3;
    size_t dst_mem_size = sizeof(float) * dst_h * dst_w * dst_channel;
    float *norm_data = (float*)malloc(dst_mem_size);
    uchar *img_cuda;
    float *norm_data_cuda;
    cudaMalloc((void **)&img_cuda, src_mem_size);
    cudaMalloc((void **)&norm_data_cuda, dst_mem_size);
    cudaMemcpy(img_cuda, chw.data, src_mem_size, cudaMemcpyHostToDevice);
    RGB_CropNorm(img_cuda, norm_data_cuda,  RectI{0, 0, 3840, 2160}, Shape2DI{3840, 2160}, Shape2DI{480, 270}, Point3DF{0.0, 0.0, 0.0}, Point3DF{1.0, 1.0, 1.0}, nullptr);
    cudaMemcpy(norm_data, norm_data_cuda, dst_mem_size, cudaMemcpyDeviceToHost);
    std::vector<float> r(norm_data, norm_data + dst_size);
    std::vector<float> g(norm_data + dst_size, norm_data + dst_size * 2);
    std::vector<float> b(norm_data + dst_size *2, norm_data + dst_size * 3);
    for(size_t i =0; i < 100; i++){
        printf("%f , %f, %f",r[i], g[i], b[i]);
    }
    cv::Mat matArray[] = {cv::Mat(dst_h, dst_w, CV_32FC1, b.data()), cv::Mat(dst_h, dst_w, CV_32FC1, g.data()), cv::Mat(dst_h, dst_w, CV_32FC1, r.data())};
    cv::Mat merge_image;
    cv::merge(matArray, 3, merge_image);
    cv::imwrite("img.jpg", merge_image);
    free(norm_data);
    cudaFree(img_cuda);
    cudaFree(norm_data_cuda);
    return 0;
}


host
host env:
ubuntu 20.04

nvidia-driver

compile and print some value

and the result image is correct.

In docker

I use two different docker, the kernel run in both of they return error result , all error.

docker one from nvidia triton-server:

FROM nvcr.io/nvidia/tritonserver:23.08-py3

docker two from nvidia tensorrt-llm

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=23.10-py3:
image

as above , the kernel result is all 0.

I generally suggest that when people are having difficulty with a CUDA code, that they first employ proper CUDA error checking.

If it were me, and my base machine had a CUDA 12.0 capable driver, I personally would not try to use a docker container that had a newer CUDA version in it.

thanks, I use a new docker with cuda 11.4, return is right

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.