Hi again txbob!
Thanks for your interest. Here is a version of the code without any OpenCV dependencies. This is the main.cpp file:
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#include <vector_types.h>
#include <driver_types.h>
#include <npp.h>
#include "lodepng.h"
/**
 * Builds a normalized 1-D Gaussian kernel on the host.
 *
 * The kernel has 2*r + 1 taps, where r = floor(radius + 0.5), and uses
 * sigma = radius. The taps are scaled so that they sum to 1.
 *
 * A radius of exactly 0 yields the single-tap identity kernel {1}; without
 * that special case the exponent would be 0/0 and the tap would be NaN.
 *
 * @param radius Gaussian radius (also used as sigma). Expected to be
 *               non-negative; a value that rounds to a negative r results
 *               in a negative array length being passed to new[].
 * @return Array of 2*r + 1 floats allocated with new[]; the caller owns it
 *         and must release it with delete[].
 */
float* buildGaussKernel(double radius) {
    const int r = (int)std::floor(radius + 0.5);
    const int l = 2 * r + 1;
    float* h = new float[l];
    const double sigma = radius;

    if (sigma == 0.0) {
        // radius == 0 implies r == 0 and l == 1: a single centre tap.
        h[0] = 1.0f;
        return h;
    }

    // Unnormalized Gaussian samples; sumh accumulates the float-rounded taps
    // so that the normalized kernel sums to 1 as closely as possible.
    double sumh = 0;
    for (int i = 0; i < l; ++i) {
        const double x = i - r;
        h[i] = (float)(std::exp(-(x * x) / (2 * sigma * sigma)));
        sumh += h[i];
    }

    for (int i = 0; i < l; ++i) {
        h[i] = float(h[i] / sumh);
    }
    return h;
}
/**
 * Extracts the green channel of an interleaved 4-byte-per-pixel image into a
 * single-channel float image.
 *
 * The input is taken by const reference so the pixel data is not copied on
 * every call, and pixels are visited row by row so that both buffers are
 * read/written sequentially.
 *
 * @param colorImg Interleaved image, 4 bytes per pixel, row-major; the byte at
 *                 offset 1 of each pixel is used. Must hold at least
 *                 4 * width * height bytes (not checked here).
 * @param grayImg  Output buffer of at least width * height floats.
 * @param width    Image width in pixels.
 * @param height   Image height in pixels.
 */
void convertGreenUcharToGrayFloat(const std::vector<unsigned char>& colorImg, float* grayImg, int width, int height) {
    for (int j = 0; j < height; ++j) {
        for (int i = 0; i < width; ++i) {
            // size_t arithmetic avoids int overflow in 4 * pixel for large images.
            const std::size_t pixel = static_cast<std::size_t>(j) * static_cast<std::size_t>(width)
                                    + static_cast<std::size_t>(i);
            grayImg[pixel] = static_cast<float>(colorImg[4 * pixel + 1]);
        }
    }
}
/**
 * Expands a single-channel float image into an interleaved 4-byte-per-pixel
 * image by replicating the gray level into the first three bytes of each
 * pixel and writing 255 into the fourth.
 *
 * The fourth byte is written explicitly so the result does not depend on the
 * previous contents of colorImg. Gray values are clamped to [0, 255] before
 * the narrowing conversion (NaN maps to 0), because converting an
 * out-of-range float to unsigned char is undefined; in-range values are
 * truncated toward zero.
 *
 * @param grayImg  Input buffer of width * height floats, row-major.
 * @param colorImg Output buffer of at least 4 * width * height bytes.
 * @param width    Image width in pixels.
 * @param height   Image height in pixels.
 */
void convertGrayFloatToColorUchar(float* grayImg, unsigned char* colorImg, int width, int height) {
    for (int j = 0; j < height; ++j) {
        for (int i = 0; i < width; ++i) {
            const std::size_t pixel = static_cast<std::size_t>(j) * static_cast<std::size_t>(width)
                                    + static_cast<std::size_t>(i);
            const float value = grayImg[pixel];

            unsigned char level;
            if (!(value > 0.0f)) {
                level = 0;      // negative, zero or NaN
            } else if (value >= 255.0f) {
                level = 255;
            } else {
                level = static_cast<unsigned char>(value);
            }

            unsigned char* out = colorImg + 4 * pixel;
            out[0] = level;
            out[1] = level;
            out[2] = level;
            out[3] = 255;       // fully opaque
        }
    }
}
void filterImageGauss(float* d_rgba, float* d_dest_rgba, float* dKernel, int radius, int width, int height, cudaStream_t stream) {
nppSetStream(stream);
const Npp32s nStep = static_cast<Npp32s>(width*sizeof(float));
NppStatus sp;
{
NppiSize szROI = {width-(2*radius+1), height};
int offset = radius;
sp = nppiFilterRow_32f_C1R(d_rgba+offset, nStep, d_dest_rgba+offset , nStep, szROI, dKernel, 2*radius+1, radius);
}
if (sp != NPP_NO_ERROR) return;
}
int main( int argc, char** argv )
{
std::vector<unsigned char> in_image;
lodepng::State state;
std::vector<unsigned char> input_file;
unsigned width = 1920;
unsigned height = 1080;
lodepng::load_file(input_file, "F:\testImage.png"); //load the image file with given filename
unsigned error = lodepng::decode(in_image, width, height, state, input_file);
if(in_image.size() == 0)
{
std::cout << "Could not open or find the image" << std::endl ;
return -1;
}
float* hImage=0;
hImage = new float[width*height];
convertGreenUcharToGrayFloat(in_image, hImage, width, height);
const int radiusFilter1 = 6;
const int radiusFilter2 = 2;
const int radiusFilter3 = 6;
float* hKernel1 = buildGaussKernel(radiusFilter1);
float* hKernel2 = buildGaussKernel(radiusFilter2);
float* hKernel3 = buildGaussKernel(radiusFilter3);
float* dKernel1;
float* dKernel2;
float* dKernel3;
cudaMalloc(&dKernel1, sizeof(Npp32f)*(2*radiusFilter1+1));
cudaMalloc(&dKernel2, sizeof(Npp32f)*(2*radiusFilter2+1));
cudaMalloc(&dKernel3, sizeof(Npp32f)*(2*radiusFilter3+1));
cudaMemcpy(dKernel1, hKernel1, sizeof(float)*(2*radiusFilter1+1), cudaMemcpyHostToDevice);
cudaMemcpy(dKernel2, hKernel2, sizeof(float)*(2*radiusFilter2+1), cudaMemcpyHostToDevice);
cudaMemcpy(dKernel3, hKernel3, sizeof(float)*(2*radiusFilter3+1), cudaMemcpyHostToDevice);
float* dImage1;
float* dImage2;
float* dImage3;
cudaMalloc(&dImage1, sizeof(float)*(width*height));
cudaMalloc(&dImage2, sizeof(float)*(width*height));
cudaMalloc(&dImage3, sizeof(float)*(width*height));
cudaMemcpy(dImage1, hImage, width*height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dImage2, hImage, width*height*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dImage3, hImage, width*height*sizeof(float), cudaMemcpyHostToDevice);
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
float* dFilteredImage1;
float* dFilteredImage2;
float* dFilteredImage3;
cudaMalloc(&dFilteredImage1, sizeof(float)*(width*height));
cudaMalloc(&dFilteredImage2, sizeof(float)*(width*height));
cudaMalloc(&dFilteredImage3, sizeof(float)*(width*height));
float* hFilteredImage1[100];
float* hFilteredImage2[100];
float* hFilteredImage3[100];
for (int i = 0; i<100; i++) {
hFilteredImage1[i] = new float[width*height];
hFilteredImage2[i] = new float[width*height];
hFilteredImage3[i] = new float[width*height];
}
for (int i = 0; i<100; i++) {
filterImageGauss(dImage1, dFilteredImage1, dKernel1, radiusFilter1, width, height, stream1);
filterImageGauss(dImage2, dFilteredImage2, dKernel2, radiusFilter2, width, height, stream2);
filterImageGauss(dImage3, dFilteredImage3, dKernel3, radiusFilter3, width, height, stream3);
cudaMemcpyAsync(hFilteredImage1[i], dFilteredImage1, width*height*sizeof(float), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(hFilteredImage2[i], dFilteredImage2, width*height*sizeof(float), cudaMemcpyDeviceToHost, stream2);
cudaMemcpyAsync(hFilteredImage3[i], dFilteredImage3, width*height*sizeof(float), cudaMemcpyDeviceToHost, stream3);
}
cudaDeviceSynchronize();
unsigned char* hFilteredImageU1 = new unsigned char[4*width*height];
unsigned char* hFilteredImageU2 = new unsigned char[4*width*height];
unsigned char* hFilteredImageU3 = new unsigned char[4*width*height];
for (int i = 0; i<100; i++) {
convertGrayFloatToColorUchar(hFilteredImage1[i], hFilteredImageU1, width, height);
convertGrayFloatToColorUchar(hFilteredImage2[i], hFilteredImageU2, width, height);
convertGrayFloatToColorUchar(hFilteredImage3[i], hFilteredImageU3, width, height);
char num[350]="F:\test\image";
char str[10];
char pmp[10]=".png";
itoa(i, str, 10);
strcat(num,str);
strcat(num,pmp);
error = lodepng::encode(num, hFilteredImageU2, width, height);
}
return 0;
}
The CMakeLists.txt file:
# Build script for the nppFilterTest executable (main.cpp + lodepng),
# linked against the CUDA runtime, NPP and nvToolsExt.
cmake_minimum_required(VERSION 2.8)

set(THIS_PROJECT_NAME "nppFilterTest")
project(${THIS_PROJECT_NAME})

#########################
# CUDA
#########################
# Take the toolkit location from the CUDA_PATH_V7_0 environment variable
# (if we have CUDA installed CUDA_PATH_Vx is defined).
set(CUDA_TOOLKIT_ROOT_DIR "$ENV{CUDA_PATH_V7_0}")
find_package(CUDA REQUIRED)

# CC 3.0 code is always generated; CC 5.2 SASS + PTX are added on request.
option(CUDA_MAXWELL_COMPATIBILITY "Build engine with maxwell support (CC 5.2)" OFF)
set(CUDA_NVCC_FLAGS -gencode=arch=compute_30,code=sm_30)
if(CUDA_MAXWELL_COMPATIBILITY)
  list(APPEND CUDA_NVCC_FLAGS
       -gencode=arch=compute_52,code=sm_52
       -gencode=arch=compute_52,code=compute_52)
endif()
set(CUDA_64_BIT_DEVICE_CODE ON)

# Header and library search paths (CUDA plus NVIDIA Tools Extension).
include_directories(
  "${CUDA_INCLUDE_DIRS}"
  "$ENV{NVTOOLSEXT_PATH}/include")
link_directories("$ENV{NVTOOLSEXT_PATH}/lib/x64")

# Libraries to link, appended to any pre-existing LIBS value.
set(LIBS
  ${LIBS}
  "${CUDA_CUDA_LIBRARY}"
  "${CUDA_LIBRARIES}"
  "${CUDA_nppc_LIBRARY}"
  "${CUDA_nppi_LIBRARY}"
  nvToolsExt64_1)

# Sources and headers of the test program.
set(SOURCE_FILES
  main.cpp
  lodepng.cpp)
set(HEADER_FILES
  lodepng.h)

add_executable(${THIS_PROJECT_NAME} ${SOURCE_FILES} ${HEADER_FILES})
target_link_libraries(${THIS_PROJECT_NAME} ${LIBS})
To load and save images from disk without OpenCV, I have added the lodepng.cpp and lodepng.h files to the project. You can download them from http://lodev.org/lodepng/.
I tried the code with the following image, which I downloaded and renamed to “testImage.png”:
https://www.google.es/search?q=images&espv=2&biw=1920&bih=935&tbm=isch&source=lnt&tbs=isz:ex,iszw:1920,iszh:1080#imgrc=Ao8iqkc_YpUEjM%3A
I hope this helps to reproduce the issue.