Hi, I’m currently new to CUDA programming and was doing a small project to read an image on openCV and change it into a structure named RGB which have three variables both R,G,B and my code is this
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#include
#include
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
struct RGB {
uchar R;
uchar G;
uchar B;
};
global void seperate(uchar* img, RGB* rgb) {
int threadid = threadIdx.x + threadIdx.y * blockDim.x;
printf("%d\n", threadid);
(rgb + threadid)->R = img[threadid * 3];
(rgb + threadid)->G = img[threadid * 3 + 1];
(rgb + threadid)->B = img[threadid * 3 + 2];
}
int main(int arc,char** argv)
{
Mat img = imread(“D:/CUDA/Blur_Image/Lenna.png”, IMREAD_COLOR);
if (img.empty()) {
printf("이미지 없음\n");
return 1;
}
int num_row = img.rows;
int num_col = img.cols;
int num_cha = img.channels();
long img_size = num_row * num_col;
long img_size_w_cha = num_row * num_col * num_cha;
RGB* h_rgb_img = new RGB[img_size];
uchar* h_img = img.data;
RGB* d_rgb_img;
uchar* d_img;
cudaMalloc((void**)&d_rgb_img, img_size * sizeof(RGB));
cudaMalloc((void**)&d_img, img_size_w_cha * sizeof(uchar));
cudaMemcpy(d_img, h_img, img_size_w_cha * sizeof(uchar), cudaMemcpyHostToDevice);
dim3 grid(1, 0, 0);
dim3 block(num_row, num_col, 0);
seperate << <grid, block >> > (d_img, d_rgb_img);
cudaDeviceSynchronize();
cudaMemcpy(h_rgb_img, d_rgb_img, img_size * sizeof(RGB), cudaMemcpyDeviceToHost);
delete h_rgb_img;
cudaFree(d_img);
cudaFree(d_rgb_img);
return 0;
}
But, the code didn’t give the right value for the h_rgb_img so I tried using the Nsight debugger and not like the tutorials it didn’t give anything… all the memory which the device was pointing to was not allowing access while debugging and even though I put an break point to the kernel it didn’t go in… what am I doing wrong…
This are the pictures showing the symptoms