Hi
I wrote a cuda program,but its running speed is very slow.Please help me see to the error of the
program.i have put the source code in the attachment. Thank you very much.
softWare: ubuntu
hardWare: nvidia Tegra K1
Here is platform information:
device name: GK20A
computer ability: 3.2
MaxGridSize: 2147483647,65535,65535
MaxThreadPerBlock: 1024
MaxThreadDim: 1024,1024,64
multiProcessorCount: 1
resPerBlock: 32768(K)
sharedMemoryPerBlock(K): 48
totalGlobalMemory: 1892(M)
warpSize: 32
ConstanMemory: 64(K)
Test:
GPU :kernel.cu
time of Census: 441.06 ms
time of TransposeDSI: 1205.99 ms
CPU :stereo.cpp
time of census: 346 ms
time of TransposeDSI: 44 ms
main.cpp
//#define USE_CUDA
void StereoTest(void)
{
const int disp_num = 64;
cudaInit();
Mat imgLeft = imread("1-L.png",0);
Mat imgRight= imread("1-R.png",0);
cv::Size img_size;
img_size.width = 1024;
img_size.height = 400;
cv::resize(imgLeft, imgLeft, img_size);
cv::resize(imgRight, imgRight, img_size);
int height = imgLeft.rows;
int width = imgLeft.cols;
u16 *dsi =(u16 *)malloc(sizeof(int)*width*height*disp_num);
u16 *dsiAgg=(u16 *)malloc(sizeof(int)*width*height*disp_num);
cout << "runing..." <<endl;
#ifdef USE_CUDA
memset(dsiAgg, 0, height * width * disp_num* sizeof(unsigned short));
cudaCensus(imgLeft, imgRight, dsiAgg);
#else
clock_t start = clock();
Census((uchar *)imgLeft.data, (uchar *)imgRight.data, height, width, disp_num, dsi);
printf("time of census: %.3f s\n", (double)(clock()-start)/CLOCKS_PER_SEC);
clock_t TransposeDSI_clock = clock();
TransposeDSI(width, height, disp_num, dsiAgg, dsi);
printf("time of TransposeDSI: %.3f s\n", (double)(clock()-TransposeDSI_clock)/CLOCKS_PER_SEC);
printf("total time: %.3f s\n\n\n", (double)(clock()-start)/CLOCKS_PER_SEC);
#endif
free(dsi);
free(dsiAgg);
}
int main(int argc, char *argv[])
{
StereoTest();
return 0;
}
stereo.cpp
#include "stereo.h"
inline uint64 getCensus(uchar* source, int width, int i, int j)
{
uint64 value = 0;
uchar center = source[i*width + j];
for (int m = -3; m <= 3; m++)
{
for (int n = -4; n <= 4; n++)
{
value = (value << 1);
value += (source[(i+m)*width+j+n] < center);
}
}
return value;
}
uchar hamDist16(ushort x, ushort y)
{
uchar dist = 0;
ushort val = (x^y);
while(val)
{
++dist;
val &= val - 1;
}
return dist;
}
void Census(uchar *left, uchar *right, int height, int width, int dispnum, u16 *dsi)
{
static bool firstRun = true;
static uchar *popcount16LUT = new uchar[65536];
uint64 *leftCensus = new uint64[width];
uint64 *rightCensus = new uint64[width];
if(firstRun)
{
for (int i = 0; i < 65536; i++) {
popcount16LUT[i] = hamDist16(i,0);
}
memset(dsi,0,height*width*dispnum*sizeof(u16));
firstRun = false;
}
for (int i = 3; i < height - 3; i++)
{
for (int j = 4; j < width - 4; j++)
{
leftCensus[j] = getCensus(left, width, i, j);
rightCensus[j] = getCensus(right, width, i, j);
}
// Hamming Distance
for (int j = 4; j < width - 4; j++)
{
for (int d = 0; d < dispnum; d++)
{
if( d <= j - 4)
{
// total 44 bytes
dsi[(i*width+j)*dispnum + d] = popcount16LUT[ (ushort)(leftCensus[j] ^ rightCensus[j-d]) ]
+ popcount16LUT[ (ushort)((leftCensus[j]>>16) ^ (rightCensus[j-d]>>16)) ]
+ popcount16LUT[ (ushort)((leftCensus[j]>>32) ^ (rightCensus[j-d]>>32)) ];
}
else
{
dsi[(i*width+j)*dispnum + d] = 0xff;
}
}
}
}
delete [] leftCensus;
delete [] rightCensus;
}
void TransposeDSI(int width, int height, int n_disp, u16 *dsiSrc, u16 *dsiDes)
{
int index1, index2;
for(int j=0;j<height;j++)
{
for(int i=0;i<width;i++)
{
index1=j*width*n_disp+i*n_disp;
index2=i*height*n_disp+j*n_disp;
memcpy(dsiDes+index2, dsiSrc+index1, sizeof(u16)*n_disp);
}
}
}
kernel.cu
static bool firstRun = true;
static uchar *popcount16LUT = new uchar[65536];
static const int dispnum = 64;
static unsigned char hamDist_16(ushort x, ushort y)
{
unsigned char dist = 0;
unsigned short val = (x^y);
while(val)
{
++dist;
val &= val - 1;
}
return dist;
}
__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{
int i = threadIdx.x;
int j = blockIdx.x;
int index1, index2;
int d;
index1=j*width*n_disp+i*n_disp;
index2=i*height*n_disp+j*n_disp;
dsiDes += index2;
dsiSrc += index1;
for(d = 0; d < n_disp; d++) {
dsiDes[d] = dsiSrc[d];
dsiSrc[d] = 0;
}
//memcpy(dsiDes+index2, dsiSrc+index1, sizeof(unsigned short)*n_disp);
}
__global__ void getCensus(unsigned char *left_data, unsigned char *right_data,
unsigned char *popcount16LUT, unsigned short *dsi,
int height, int width)
{
__shared__ unsigned long long int leftCensus[1024];
__shared__ unsigned long long int rightCensus[1024];
unsigned long long int left_val = 0;
unsigned long long int right_val = 0;
unsigned long long int val_l;
unsigned long long int val_r;
int tid = threadIdx.x;
int bid = blockIdx.x;
if(tid >= 4 && bid >= 3) {
unsigned char left_center = left_data[bid*width + tid];
unsigned char right_center = right_data[bid*width + tid];
for (int m = -3; m <= 3; m++)
{
for (int n = -4; n <= 4; n++)
{
left_val = (left_val << 1);
left_val += (left_data[(bid+m) * width + tid + n] < left_center);
right_val = (right_val << 1);
right_val += (right_data[(bid+m) * width + tid + n] < right_center);
}
}
leftCensus[tid] = left_val;
rightCensus[tid] = right_val;
__syncthreads();
for (int d = 0; d < 64; d++)
{
if( d <= tid - 4)
{
val_l = leftCensus[tid];
val_r = rightCensus[tid-d];
// total 44 bytes
dsi[(bid*width+tid)*64 + d] = popcount16LUT[ (unsigned short)(val_l ^ val_r)]
+ popcount16LUT[(unsigned short)((val_l>>16) ^ (val_r>>16))]
+ popcount16LUT[(unsigned short)((val_l>>32) ^ (val_r>>32))];
}
else
{
dsi[(bid*width+tid)*64 + d] = 0xff;
}
}
}
else {
leftCensus[tid] = 0;
rightCensus[tid] = 0;
}
}
uchar *left_data, *right_data;
float *disp_img;
unsigned short *out_dist;
unsigned short *dist_agg;
extern "C"
void cudaInit()
{
HANDLE_ERROR(cudaMalloc( (void**)&left_data, WIDTH * HEIGHT * sizeof(uchar)));
HANDLE_ERROR(cudaMalloc( (void**)&right_data, HEIGHT * WIDTH * sizeof(uchar)));
HANDLE_ERROR(cudaMalloc( (void**)&out_dist, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
HANDLE_ERROR(cudaMalloc( (void**)&dist_agg, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
HANDLE_ERROR(cudaMalloc( (void**)&disp_img, WIDTH * HEIGHT *sizeof(float)));
if(firstRun)
{
HANDLE_ERROR(cudaMalloc( (void**)&popcount16LUT, 65536));
static uchar *tmp_popcount16LUT = new uchar[65536];
// build the popcount16LUT
for (int i = 0; i < 65536; i++) {
tmp_popcount16LUT[i] = hamDist_16(i,0);
}
HANDLE_ERROR(cudaMemcpy(popcount16LUT, tmp_popcount16LUT, 65536, cudaMemcpyHostToDevice));
firstRun = false;
}
}
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi) {
int height = left_img.rows;
int width = left_img.cols;
int data_size = height * width * sizeof(uchar);
HANDLE_ERROR(cudaMemcpy(left_data, left_img.data, data_size, cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(right_data, right_img.data, data_size, cudaMemcpyHostToDevice));
cudaEvent_t getCensus_start, getCensus_stop;
cudaEventCreate(&getCensus_start);
cudaEventCreate(&getCensus_stop);
cudaEventRecord(getCensus_start, NULL);
getCensus<<<height, width>>>(left_data, right_data, popcount16LUT, out_dist, height, width);
cudaDeviceSynchronize();
cudaEventRecord(getCensus_stop, NULL);
cudaEventSynchronize(getCensus_stop);
float getCensus_time = 0.0f;
cudaEventElapsedTime(&getCensus_time, getCensus_start, getCensus_stop);
cudaEventDestroy(getCensus_start);
cudaEventDestroy(getCensus_stop);
std::cout << " getCensus time=" << getCensus_time<<std::endl;
/* TransposeDSI */
cudaEvent_t TransposeDSI_start, TransposeDSI_stop;
cudaEventCreate(&TransposeDSI_start);
cudaEventCreate(&TransposeDSI_stop);
cudaEventRecord(TransposeDSI_start, NULL);
TransposeDSI<<<height, width>>>(out_dist, dist_agg, width, height, dispnum);
cudaDeviceSynchronize();
cudaEventRecord(TransposeDSI_stop, NULL);
cudaEventSynchronize(TransposeDSI_stop);
float TransposeDSI_time = 0.0f;
cudaEventElapsedTime(&TransposeDSI_time, TransposeDSI_start, TransposeDSI_stop);
cudaEventDestroy(TransposeDSI_start);
cudaEventDestroy(TransposeDSI_stop);
std::cout << " TransposeDSI time=" << TransposeDSI_time<<std::endl;
HANDLE_ERROR(cudaMemcpy(dsi, dist_agg, width * height *sizeof(unsigned short), cudaMemcpyDeviceToHost));
return 0;
}
Revised:
12/22/2016
__global__ void getCensus( unsigned long int *leftCensus,unsigned long int *rightCensus, int height, int width)
{
//GridDim.x = height;
//blockDim.x = width;
unsigned long int left_val = 0;
unsigned long int right_val = 0;
int m,n;
unsigned char left_center,right_center;
int tx = threadIdx.x;
int bx = blockIdx.x;
int offset = bx*width+tx;
if(tx >= 4 && bx >= 3) {
left_center = tex2D(tex2Dleft, tx, bx);
right_center = tex2D(tex2Dright, tx, bx);
for (m = -3; m <= 3; m++)
{
for (n = -4; n <= 4; n++)
{
left_val = (left_val << 1);
left_val += (tex2D(tex2Dleft, tx+n, bx+m) < left_center);
right_val = (right_val << 1);
right_val += (tex2D(tex2Dright, tx+n, bx+m) < right_center);
}
}
leftCensus[offset] = left_val;
rightCensus[offset] = right_val;
}
else {
leftCensus[offset] = 0;
rightCensus[offset] = 0;
}
}
__global__ void census( unsigned short *dist,int height, int width)
{
//blockDim.x = 64;
//blockDim.y = 1024 / 64 = 16;
//GridDim.x = (width*height) / 16;
unsigned long long int val_l;
unsigned long long int val_r;
int x,y;
int tx= threadIdx.x;
int ty= threadIdx.y;
int bx = blockIdx.x;
int offset = (bx * 16 + ty);
y = (int)(bx * 16/width);
x = offset - y*width;
int dist_offset = offset*64;
if(x >= 4 && y >= 3) {
if( tx <= x - 4)
{
val_l = tex2D(left_census_2D,x,y);
val_r = tex2D(right_census_2D, x-tx,y);
dist[dist_offset + tx] = tex1Dfetch(popcount16LUT_1D, (unsigned short)(val_l ^ val_r))
+ tex1Dfetch(popcount16LUT_1D, (unsigned short)((val_l>>16) ^ (val_r>>16)))
+ tex1Dfetch(popcount16LUT_1D, (unsigned short)((val_l>>32) ^ (val_r>>32)));
}
else
{
dist[dist_offset + tx] = 0xff;
}
}
}
__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{
//blockDim.x = 64;
//blockDim.y = 1024 / 64 = 16;
//GridDim.x = (width*height) / 16;
int x,y;
int tx= threadIdx.x;
int ty= threadIdx.y;
int bx = blockIdx.x;
int offset = (bx * 16 + ty);
y = (int)(bx * 16/width);
x = offset - y*width;
int index1, index2;
index1= y*width*n_disp+x*n_disp + tx;
index2= x*height*n_disp+y*n_disp +tx;
dsiDes[index2] = dsiSrc[index1];
dsiSrc[index1] = 0;
}
StereoHL.zip (582 KB)