cuda program running speed is very slow

Hi
I wrote a cuda program,but its running speed is very slow.Please help me see to the error of the
program.i have put the source code in the attachment. Thank you very much.

softWare: ubuntu
hardWare: nvidia Tegra K1

Here is platform information:
device name: GK20A
computer ability: 3.2
MaxGridSize: 2147483647,65535,65535
MaxThreadPerBlock: 1024
MaxThreadDim: 1024,1024,64
multiProcessorCount: 1
resPerBlock: 32768(K)
sharedMemoryPerBlock(K): 48
totalGlobalMemory: 1892(M)
warpSize: 32
ConstanMemory: 64(K)

Test:
GPU :kernel.cu
time of Census: 441.06 ms
time of TransposeDSI: 1205.99 ms

CPU :stereo.cpp
time of census: 346 ms
time of TransposeDSI: 44 ms

main.cpp

//#define  USE_CUDA
void StereoTest(void)
{
    const int disp_num = 64;
    cudaInit();
    Mat imgLeft  = imread("1-L.png",0);
    Mat imgRight= imread("1-R.png",0);
    cv::Size img_size;
    img_size.width = 1024;
    img_size.height = 400;
    cv::resize(imgLeft, imgLeft, img_size);
    cv::resize(imgRight, imgRight, img_size);
    int height = imgLeft.rows;
    int width  = imgLeft.cols;
    u16 *dsi      =(u16 *)malloc(sizeof(int)*width*height*disp_num);
    u16 *dsiAgg=(u16 *)malloc(sizeof(int)*width*height*disp_num);
    cout << "runing..." <<endl;
 
#ifdef USE_CUDA
    memset(dsiAgg, 0, height * width * disp_num* sizeof(unsigned short));
    cudaCensus(imgLeft, imgRight, dsiAgg);
#else
    clock_t start = clock();
    Census((uchar *)imgLeft.data, (uchar *)imgRight.data, height, width, disp_num, dsi);
    printf("time of census: %.3f s\n", (double)(clock()-start)/CLOCKS_PER_SEC);
 
    clock_t TransposeDSI_clock = clock();
    TransposeDSI(width, height, disp_num, dsiAgg, dsi);
    printf("time of TransposeDSI: %.3f s\n", (double)(clock()-TransposeDSI_clock)/CLOCKS_PER_SEC);
 
    printf("total time: %.3f s\n\n\n", (double)(clock()-start)/CLOCKS_PER_SEC);
#endif
 
    free(dsi);
    free(dsiAgg);
}
 
int main(int argc, char *argv[])
{   
    StereoTest();
    return 0;
}

stereo.cpp

#include "stereo.h"
inline uint64 getCensus(uchar* source, int width, int i, int j)
{
    uint64 value = 0;
    uchar center = source[i*width + j];
    for (int m = -3; m <= 3; m++)
    {
        for (int n =  -4; n <= 4; n++)
        {
            value = (value << 1);
            value += (source[(i+m)*width+j+n] < center);
        }
    }
    return value;
}
uchar hamDist16(ushort x, ushort y)
{
    uchar dist = 0;
    ushort val = (x^y);
    while(val)
    {
        ++dist;
        val &= val - 1;
    }
    return dist;
}
void Census(uchar *left, uchar *right, int height, int width, int dispnum, u16 *dsi)
{
    static bool firstRun = true;
    static uchar *popcount16LUT = new uchar[65536];
    uint64 *leftCensus   = new uint64[width];
    uint64 *rightCensus = new uint64[width];
    if(firstRun)
    {
        for (int i = 0; i < 65536; i++) {
            popcount16LUT[i] = hamDist16(i,0);
        }
 
        memset(dsi,0,height*width*dispnum*sizeof(u16));
        firstRun = false;
    }
    for (int i = 3; i < height - 3; i++)
    {
        for (int j = 4; j < width - 4; j++)
        {
            leftCensus[j]   = getCensus(left, width, i, j);
            rightCensus[j] = getCensus(right, width, i, j);
        }
        // Hamming Distance
        for (int j = 4; j < width - 4; j++)
        {
            for (int d = 0; d < dispnum; d++)
            {
                if( d <= j - 4)
                {
                    // total 44 bytes
                    dsi[(i*width+j)*dispnum + d] = popcount16LUT[ (ushort)(leftCensus[j] ^ rightCensus[j-d]) ]
                            + popcount16LUT[ (ushort)((leftCensus[j]>>16) ^ (rightCensus[j-d]>>16)) ]
                            + popcount16LUT[ (ushort)((leftCensus[j]>>32) ^ (rightCensus[j-d]>>32)) ];
                }
                else
                {
                    dsi[(i*width+j)*dispnum + d] = 0xff;
                }
            }
        }
    }
    delete [] leftCensus;
    delete [] rightCensus;
}
void TransposeDSI(int width, int height, int n_disp, u16 *dsiSrc, u16 *dsiDes)
{
    int index1, index2;
    for(int j=0;j<height;j++)
    {
        for(int i=0;i<width;i++)
        {
            index1=j*width*n_disp+i*n_disp;
            index2=i*height*n_disp+j*n_disp;
            memcpy(dsiDes+index2, dsiSrc+index1, sizeof(u16)*n_disp);
        }
    }
}

kernel.cu

static bool firstRun = true;
static uchar *popcount16LUT = new uchar[65536];
static const int dispnum = 64;
 
static unsigned char hamDist_16(ushort x, ushort y)
{
    unsigned char dist = 0;
    unsigned short val = (x^y);
    while(val)
    {
        ++dist;
        val &= val - 1;
    }
    return dist;
}
 
__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{
    int i = threadIdx.x;
    int j = blockIdx.x;
    int index1, index2;
    int d;
    index1=j*width*n_disp+i*n_disp;
    index2=i*height*n_disp+j*n_disp;
    dsiDes += index2;
    dsiSrc += index1;
    for(d = 0; d < n_disp; d++) {
        dsiDes[d] = dsiSrc[d];
        dsiSrc[d] = 0;
    }
    //memcpy(dsiDes+index2, dsiSrc+index1, sizeof(unsigned short)*n_disp);
}
 
__global__ void getCensus(unsigned char *left_data, unsigned char *right_data,
                          unsigned char *popcount16LUT, unsigned short *dsi,
                          int height, int width)
{
    __shared__ unsigned long long int leftCensus[1024];
    __shared__ unsigned long long int rightCensus[1024];
    unsigned long long int left_val = 0;
    unsigned long long int right_val = 0;
    unsigned long long int val_l;
    unsigned long long int val_r;
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    if(tid >= 4 && bid >= 3) {
        unsigned char left_center = left_data[bid*width + tid];
        unsigned char right_center = right_data[bid*width + tid];
 
        for (int m = -3; m <= 3; m++)
        {
            for (int n =  -4; n <= 4; n++)
            {
                left_val = (left_val << 1);
                left_val += (left_data[(bid+m) * width + tid + n] < left_center);
                right_val = (right_val << 1);
                right_val += (right_data[(bid+m) * width + tid + n] < right_center);
            }
        }
        leftCensus[tid] = left_val;
        rightCensus[tid] = right_val;
 
        __syncthreads();
        for (int d = 0; d < 64; d++)
        {
            if( d <= tid - 4)
            {
                val_l = leftCensus[tid];
                val_r = rightCensus[tid-d];
            // total 44 bytes
                dsi[(bid*width+tid)*64 + d] = popcount16LUT[ (unsigned short)(val_l ^ val_r)]
                    + popcount16LUT[(unsigned short)((val_l>>16) ^ (val_r>>16))]
                    + popcount16LUT[(unsigned short)((val_l>>32) ^ (val_r>>32))];
            }
            else
            {
                dsi[(bid*width+tid)*64 + d] = 0xff;
            }
        }
    }
    else {
        leftCensus[tid] = 0;
        rightCensus[tid] = 0;
    }
 
}
 
uchar *left_data, *right_data;
float *disp_img;
unsigned short *out_dist;
unsigned short *dist_agg;
 
extern "C"
void cudaInit()
{
    HANDLE_ERROR(cudaMalloc( (void**)&left_data, WIDTH * HEIGHT * sizeof(uchar)));
    HANDLE_ERROR(cudaMalloc( (void**)&right_data, HEIGHT * WIDTH * sizeof(uchar)));
    HANDLE_ERROR(cudaMalloc( (void**)&out_dist, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
    HANDLE_ERROR(cudaMalloc( (void**)&dist_agg, sizeof(unsigned short)*WIDTH*HEIGHT*dispnum));
    HANDLE_ERROR(cudaMalloc( (void**)&disp_img, WIDTH * HEIGHT *sizeof(float)));
    if(firstRun)
    {
        HANDLE_ERROR(cudaMalloc( (void**)&popcount16LUT, 65536));
        static uchar *tmp_popcount16LUT = new uchar[65536];
        // build the popcount16LUT
        for (int i = 0; i < 65536; i++) {
            tmp_popcount16LUT[i] = hamDist_16(i,0);
        }
        HANDLE_ERROR(cudaMemcpy(popcount16LUT, tmp_popcount16LUT, 65536, cudaMemcpyHostToDevice));
        firstRun = false;
    }
}
 
extern "C"
int cudaCensus(cv::Mat& left_img, cv::Mat& right_img, unsigned short *dsi) {
    int height = left_img.rows;
    int width  = left_img.cols;
    int data_size = height * width * sizeof(uchar);
    HANDLE_ERROR(cudaMemcpy(left_data, left_img.data, data_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(right_data, right_img.data, data_size, cudaMemcpyHostToDevice));
 
    cudaEvent_t getCensus_start, getCensus_stop;
    cudaEventCreate(&getCensus_start);
    cudaEventCreate(&getCensus_stop);
    cudaEventRecord(getCensus_start, NULL);
    getCensus<<<height, width>>>(left_data, right_data, popcount16LUT, out_dist, height, width);
    cudaDeviceSynchronize();
    cudaEventRecord(getCensus_stop, NULL);
    cudaEventSynchronize(getCensus_stop);
    float getCensus_time = 0.0f;
    cudaEventElapsedTime(&getCensus_time, getCensus_start, getCensus_stop);
    cudaEventDestroy(getCensus_start);
    cudaEventDestroy(getCensus_stop);
    std::cout << " getCensus  time=" << getCensus_time<<std::endl;
    /* TransposeDSI */
    cudaEvent_t TransposeDSI_start, TransposeDSI_stop;
    cudaEventCreate(&TransposeDSI_start);
    cudaEventCreate(&TransposeDSI_stop);
    cudaEventRecord(TransposeDSI_start, NULL);
    TransposeDSI<<<height, width>>>(out_dist, dist_agg, width, height, dispnum);
    cudaDeviceSynchronize();
    cudaEventRecord(TransposeDSI_stop, NULL);
    cudaEventSynchronize(TransposeDSI_stop);
    float TransposeDSI_time = 0.0f;
    cudaEventElapsedTime(&TransposeDSI_time, TransposeDSI_start, TransposeDSI_stop);
    cudaEventDestroy(TransposeDSI_start);
    cudaEventDestroy(TransposeDSI_stop);
    std::cout << " TransposeDSI time=" << TransposeDSI_time<<std::endl;
    HANDLE_ERROR(cudaMemcpy(dsi, dist_agg, width * height *sizeof(unsigned short), cudaMemcpyDeviceToHost));
    return 0;
}

Revised:
12/22/2016

__global__ void getCensus( unsigned long int *leftCensus,unsigned long int *rightCensus, int height, int width)
{
    //GridDim.x = height;
    //blockDim.x = width;
    unsigned long int left_val = 0;
    unsigned long int right_val = 0;
    int m,n;
    unsigned char left_center,right_center;

    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int offset = bx*width+tx;
    if(tx >= 4 && bx >= 3) {

        left_center = tex2D(tex2Dleft, tx, bx);
        right_center = tex2D(tex2Dright, tx, bx);

        for (m = -3; m <= 3; m++)
        {
            for (n =  -4; n <= 4; n++)
            {
                left_val = (left_val << 1);
                left_val += (tex2D(tex2Dleft, tx+n, bx+m) < left_center);

                right_val = (right_val << 1);
                right_val += (tex2D(tex2Dright, tx+n, bx+m) < right_center);
            }
        }
        leftCensus[offset] = left_val;
        rightCensus[offset] = right_val;

    }
    else {
        leftCensus[offset] = 0;
        rightCensus[offset] = 0;
    }

}

__global__ void census( unsigned short *dist,int height, int width)
{
    //blockDim.x = 64;
    //blockDim.y = 1024 / 64 = 16;
    //GridDim.x = (width*height) / 16;
    unsigned long long int val_l;
    unsigned long long int val_r;
    int x,y;
    int tx= threadIdx.x;
    int ty= threadIdx.y;
    int bx = blockIdx.x;
    int offset = (bx * 16 + ty);
    y = (int)(bx * 16/width);
    x = offset - y*width;
    int dist_offset = offset*64;

    if(x >= 4 && y >= 3) {

        if( tx <= x - 4)
        {
                val_l = tex2D(left_census_2D,x,y);
                val_r = tex2D(right_census_2D, x-tx,y);

                dist[dist_offset + tx] = tex1Dfetch(popcount16LUT_1D, (unsigned short)(val_l ^ val_r))
                    + tex1Dfetch(popcount16LUT_1D, (unsigned short)((val_l>>16) ^ (val_r>>16)))
                    + tex1Dfetch(popcount16LUT_1D, (unsigned short)((val_l>>32) ^ (val_r>>32)));

        }
        else
        {
                dist[dist_offset + tx] = 0xff;
        }

    }
}

__global__ void TransposeDSI(unsigned short *dsiSrc, unsigned short *dsiDes, int width, int height, int n_disp)
{

    //blockDim.x = 64;
    //blockDim.y = 1024 / 64 = 16;
    //GridDim.x = (width*height) / 16;
    int x,y;
    int tx= threadIdx.x;
    int ty= threadIdx.y;

    int bx = blockIdx.x;

    int offset = (bx * 16 + ty);
    y = (int)(bx * 16/width);
    x = offset - y*width;

    int index1, index2;

    index1= y*width*n_disp+x*n_disp + tx;
    index2= x*height*n_disp+y*n_disp +tx;
    dsiDes[index2] = dsiSrc[index1];
    dsiSrc[index1] = 0;
}

StereoHL.zip (582 KB)

Uncoalesced, uncached access to bytewise image data in global memory… multiple accesses of neighbor pixels per thread. That’s your problem.

→ Put the data into a texture or consider declaring left_data, right_data as const restrict.

Christian

test

As “cbuchner1” said, there are several items you could work on. You probably should start in the following order:

  1. Use tools like NSight or nvprof to profile your program’s timeline, find out the time each part costs, identify the most time-consuming part(s).
  2. Those tools also have guided optimization analysis, try them, you probably will found the tool says your memory access is not efficient. Please spend some time to read and have a good understanding about occupancy, memory coalescing, global memory vs shared memory vs texture memory, and re-design your kernels. You should be able to improve a lot on this.
  3. Think about re-design your algorithm to better utilize the GPU.
  4. Come back and report your progress, gets more feedback from the forum.

Good luck!

thanks,i use shared memory speed increase is obvious.

thank you a lot for your advice. i reduce internal loop and use shared and texture memory speed increase is obvious.i will continue to ask question to the forum for help.i have put the amendment in the previous.