GPU slower than CPU in COO SpMV Multiplication?

Hi.

I am experimenting with SpMV (sparse matrix-vector) multiplication. The matrix is in COO format, and I use kernels from the cusp library (hosted on Google Code). Here is the full source code of my program:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <algorithm>   // for std::min

#include <sys/time.h>

#include <cutil.h>

#include <cutil_inline.h>

#include "mmio.c"

#define DIVIDE_INTO(x, y) ((x + y - 1)/y)

const unsigned int WARP_SIZE = 32;

const unsigned int MAX_THREADS = 512;

const unsigned int BLOCK_SIZE = 256;
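
// note: with MAX_THREADS = 512 and BLOCK_SIZE = 256, MAX_BLOCKS below evaluates to 1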

const unsigned int MAX_BLOCKS = MAX_THREADS / (2 * BLOCK_SIZE);

const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

const bool UseCache = false;

typedef int IndexType;

typedef double ValueType;

texture<int2,1> tex_x_double;

void bind_x(const double * x)

{ CUDA_SAFE_CALL(cudaBindTexture(NULL, tex_x_double, x)); }

void unbind_x(const double * x)

{ CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_double)); }

inline __device__ double fetch_x(const int& i, const double * x)

{

#if __CUDA_ARCH__ >= 130

// double requires Compute Capability 1.3 or greater

if (UseCache)

{

  int2 v = tex1Dfetch(tex_x_double, i);

  return __hiloint2double(v.y, v.x);

}

else

{

  return x[i];

}

#else

return 1.0/0.0; // should never be called

#endif

}

texture<float,1> tex_x_float;

void bind_x(const float * x)

{ CUDA_SAFE_CALL(cudaBindTexture(NULL, tex_x_float, x)); }

void unbind_x(const float * x)

{ CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_float)); }

inline __device__ float fetch_x(const int& i, const float * x)

{

if (UseCache)

    return tex1Dfetch(tex_x_float, i);

else

    return x[i];

}

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

if( cudaSuccess != err)

{

  fprintf(stderr, "Cuda error: %s: %s.\n", msg, 

    cudaGetErrorString( err) );

  exit(EXIT_FAILURE);

}

}

// segmented reduction in shared memory

__device__ ValueType segreduce_warp(const IndexType thread_lane, IndexType row, ValueType val, IndexType * rows, ValueType * vals)

{

rows[threadIdx.x] = row;

vals[threadIdx.x] = val;
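
// inclusive segmented scan within the warp: each step adds the partial sum from
// 1, 2, 4, 8, 16 lanes back when that entry has the same row index
// (the thread_lane guards keep lanes from reading before the start of the warp)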

if( thread_lane >= 1 && row == rows[threadIdx.x - 1] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 1]; }

if( thread_lane >=  2 && row == rows[threadIdx.x -  2] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }

if( thread_lane >=  4 && row == rows[threadIdx.x -  4] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }

if( thread_lane >=  8 && row == rows[threadIdx.x -  8] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }

if( thread_lane >= 16 && row == rows[threadIdx.x - 16] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

return val;

}

__global__ void

spmv_coo_kernel(const IndexType num_nonzeros,

                 const IndexType interval_size,

                 const IndexType * I, 

                 const IndexType * J, 

                 const ValueType * V, 

                 const ValueType * x, 

                       ValueType * y,

                       IndexType * temp_rows,

                       ValueType * temp_vals)

{
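
// each warp owns 48 entries of rows[]: 16 padding slots (filled with -1 below) in front
// of its 32 row indices, so the scan can read rows[idx - 1 .. idx - 16] without lane guards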

__shared__ volatile IndexType rows[48 *(BLOCK_SIZE/32)];

__shared__ volatile ValueType vals[BLOCK_SIZE];

const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index

const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1);                           // thread index within the warp

const IndexType warp_id     = thread_id   / WARP_SIZE;                               // global warp index

const IndexType interval_begin = warp_id * interval_size;                            // warp's offset into I,J,V

const IndexType interval_end   = min(interval_begin + interval_size, num_nonzeros);  // end of warp's work

const IndexType idx = 16 * (threadIdx.x/32 + 1) + threadIdx.x;                       // thread's index into padded rows array

rows[idx - 16] = -1; // fill padding with invalid row index

if(interval_begin >= interval_end) // warp has no work to do

    return;

if (thread_lane == 31)

{

    // initialize the carry in values

    rows[idx] = I[interval_begin]; 

    vals[threadIdx.x] = 0;

}

for(IndexType n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE)

{

    IndexType row = I[n];                                         // row index (i)

    ValueType val = V[n] * fetch_x(J[n], x);            // A(i,j) * x(j)

if (thread_lane == 0)

    {

        if(row == rows[idx + 31])

            val += vals[threadIdx.x + 31];                        // row continues

        else

            y[rows[idx + 31]] += vals[threadIdx.x + 31];  // row terminated

    }

rows[idx] = row;

    vals[threadIdx.x] = val;
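
// same segmented scan as in segreduce_warp, but the -1 padding written above
// replaces the explicit thread_lane guards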

if(row == rows[idx - 1]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 1]; }

    if(row == rows[idx -  2]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }

    if(row == rows[idx -  4]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }

    if(row == rows[idx -  8]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }

    if(row == rows[idx - 16]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

if(thread_lane < 31 && row != rows[idx + 1])

        y[row] += vals[threadIdx.x];                                            // row terminated

}

if(thread_lane == 31)

{

    // write the carry out values

    temp_rows[warp_id] = rows[idx];

    temp_vals[warp_id] = vals[threadIdx.x];

}

}

// Segmented reduction

__device__ void segreduce_block(const IndexType * idx, ValueType * val)

{

ValueType left = 0;

if( threadIdx.x >=   1 && idx[threadIdx.x] == idx[threadIdx.x -   1] ) { left = val[threadIdx.x -   1]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();  

if( threadIdx.x >=   2 && idx[threadIdx.x] == idx[threadIdx.x -   2] ) { left = val[threadIdx.x -   2]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >=   4 && idx[threadIdx.x] == idx[threadIdx.x -   4] ) { left = val[threadIdx.x -   4]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >=   8 && idx[threadIdx.x] == idx[threadIdx.x -   8] ) { left = val[threadIdx.x -   8]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >=  16 && idx[threadIdx.x] == idx[threadIdx.x -  16] ) { left = val[threadIdx.x -  16]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >=  32 && idx[threadIdx.x] == idx[threadIdx.x -  32] ) { left = val[threadIdx.x -  32]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();  

if( threadIdx.x >=  64 && idx[threadIdx.x] == idx[threadIdx.x -  64] ) { left = val[threadIdx.x -  64]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >= 128 && idx[threadIdx.x] == idx[threadIdx.x - 128] ) { left = val[threadIdx.x - 128]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

if( threadIdx.x >= 256 && idx[threadIdx.x] == idx[threadIdx.x - 256] ) { left = val[threadIdx.x - 256]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

}

// The second level of the segmented reduction operation

__global__ void

spmv_coo_reduce_update_kernel(const IndexType num_warps,

                          const IndexType * temp_rows,

                          const ValueType * temp_vals,

                                ValueType * y)

{

__shared__ IndexType rows[BLOCK_SIZE + 1];    

__shared__ ValueType vals[BLOCK_SIZE + 1];    
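
// end = number of carry entries that form complete blocks of BLOCK_SIZE
// (the bitmask works because BLOCK_SIZE is a power of two)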

const IndexType end = num_warps - (num_warps & (BLOCK_SIZE - 1));

if (threadIdx.x == 0)

{

    rows[BLOCK_SIZE] = (IndexType) -1;

    vals[BLOCK_SIZE] = (ValueType)  0;

}

__syncthreads();

IndexType i = threadIdx.x;

while (i < end)

{

    // do full blocks

    rows[threadIdx.x] = temp_rows[i];

    vals[threadIdx.x] = temp_vals[i];

__syncthreads();

segreduce_block(rows, vals);

if (rows[threadIdx.x] != rows[threadIdx.x + 1])

        y[rows[threadIdx.x]] += vals[threadIdx.x];

__syncthreads();

i += BLOCK_SIZE;

}

if (end < num_warps){

    if (i < num_warps){

        rows[threadIdx.x] = temp_rows[i];

        vals[threadIdx.x] = temp_vals[i];

    } else {

        rows[threadIdx.x] = (IndexType) -1;

        vals[threadIdx.x] = (ValueType)  0;

    }

__syncthreads();

segreduce_block(rows, vals);

if (i < num_warps)

        if (rows[threadIdx.x] != rows[threadIdx.x + 1])

            y[rows[threadIdx.x]] += vals[threadIdx.x];

}

}

__global__ void

spmv_coo_serial_kernel(const IndexType num_nonzeros,

                   const IndexType * I, 

                   const IndexType * J, 

                   const ValueType * V, 

                   const ValueType * x, 

                         ValueType * y)

{

for(IndexType n = 0; n < num_nonzeros; n++){

    y[I[n]] += V[n] * x[J[n]];

}

}

// simple CPU multiplication algorithm

void

spmv_coo_CPU(const IndexType num_nonzeros,

     const IndexType * I,

     const IndexType * J,

     const ValueType * V,

     const ValueType * x,

       	   ValueType * y)

{

IndexType i;

for (i = 0; i < num_nonzeros; i++)

  y[I[i]] += V[i] * x[J[i]];

}

int main(int argc, char **argv)

{

int ret_code;

MM_typecode matcode;

FILE *f;

int M, N, nz;

int i, s;

struct timeval tStart, tStop, tInterval;

unsigned int hTimer;

double czas;

IndexType *h_I, *h_J;

ValueType *h_V, *h_x, *h_y, *h_y_cpu;

if (argc < 2)

{

  fprintf(stderr, "Uzycie: %s [nazwa-pliku]\n", argv[0]);

  exit(1);

}

else

{

  if ((f = fopen(argv[1], "r")) == NULL) 

 exit(1);

}

if (mm_read_banner(f, &matcode) != 0)

{

  printf("Could not process Matrix Market banner.\n");

  exit(1);

}

/* This is how one can screen matrix types if their application */

/* only supports a subset of the Matrix Market data types. */

if (mm_is_complex(matcode) && mm_is_matrix(matcode) &&

 mm_is_sparse(matcode) )

{

  printf("Sorry, this application does not support ");

  printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));

  exit(1);

}

printf("Loading data...\n");

if ((ret_code = mm_read_mtx_crd_size(f, &N, &M, &nz)) != 0)

  exit(1);

size_t size_index = nz * sizeof(IndexType);

size_t size_value = nz * sizeof(ValueType);

h_I = (IndexType *) malloc(size_index);

h_J = (IndexType *) malloc(size_index);

h_V = (ValueType *) malloc(size_value);

for (i = 0; i < nz; ++i)

{

  fscanf(f, "%d %d %lf\n", &h_J[i], &h_I[i], &h_V[i]);

  h_I[i]--;

  h_J[i]--;

}

if (f != stdin) fclose(f);

size_t size_vec = M * sizeof(ValueType);

h_x = (ValueType *) malloc(size_vec);

for (i = 0; i < M; i++)

  h_x[i] = 2.0+i*0.005;

h_y = (ValueType *) calloc(M, sizeof(ValueType));

h_y_cpu = (ValueType *) calloc(M, sizeof(ValueType));

IndexType *d_I, *d_J;

ValueType *d_V, *d_x, *d_y;

cudaMalloc((void**)&d_I, size_index);

cudaMalloc((void**)&d_J, size_index);

cudaMalloc((void**)&d_V, size_value);

cudaMalloc((void**)&d_x, size_vec);

cudaMalloc((void**)&d_y, size_vec);

checkCUDAError("cudaMalloc");

cudaMemcpy(d_I, h_I, size_index, cudaMemcpyHostToDevice);

cudaMemcpy(d_J, h_J, size_index, cudaMemcpyHostToDevice);

cudaMemcpy(d_V, h_V, size_value, cudaMemcpyHostToDevice);

cudaMemcpy(d_x, h_x, size_vec, cudaMemcpyHostToDevice);

cudaMemcpy(d_y, h_y, size_vec, cudaMemcpyHostToDevice);

checkCUDAError("cudaMemcpy to device");

s = M/10;

if (s == 0)

  s = 1;

printf("s = %d\n", s);

printf("CPU multiplication...\n");

cutCreateTimer(&hTimer);

  cutResetTimer(hTimer);

  cudaThreadSynchronize();

  cutStartTimer(hTimer);

spmv_coo_CPU(nz, h_I, h_J, h_V, h_x, h_y_cpu);

cutStopTimer(hTimer);

printf("Execution time (CPU) = %lf ms\n", cutGetTimerValue(hTimer));

cutDeleteTimer(hTimer);

printf("Result:\n");

for (int k = 0; k < M; k+=s)

  printf("y[%d] = %lf\n", k, h_y_cpu[k]);

if (nz == 0)

  printf("Empty matrix\n");

else if (nz < WARP_SIZE)

{

  printf("Small matrix\n");

  spmv_coo_serial_kernel <<<1,1>>>

        (nz, d_I, d_J, d_V, d_x, d_y);

  cudaMemcpy(h_y, d_y, size_vec, cudaMemcpyDeviceToHost);

  printf("Result:\n");

  for (i = 0; i < M; ++i)

 printf("%lf ", h_y[i]);

  printf("\n");

}

else

{

  const unsigned int num_units = nz / WARP_SIZE;

  const unsigned int num_warps = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);

  const unsigned int num_blocks = DIVIDE_INTO(num_warps, WARPS_PER_BLOCK);

  const unsigned int num_iters = DIVIDE_INTO(num_units, num_warps);
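
// interval_size = number of consecutive nonzeros assigned to each warp; the first
// `tail` nonzeros are handled by the parallel kernel, the remaining nz - tail entries
// by the serial kernel below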

const unsigned int interval_size = WARP_SIZE * num_iters;

const IndexType tail = num_units * WARP_SIZE;

const unsigned int active_warps = (interval_size == 0) ? 0 : DIVIDE_INTO(tail, interval_size);

printf("num_units = %u\n", num_units);

  printf("num_warps = %u\n", num_warps);

  printf("num_blocks = %u\n", num_blocks);

  printf("num_inters = %u\n", num_iters);

  printf("interval_size = %u\n", interval_size);

  printf("tail = %d\n", tail);

  printf("active_warps = %u\n", active_warps);

if (UseCache)

 bind_x(d_x);

  checkCUDAError("bind");

IndexType * temp_rows;

  ValueType * temp_vals;

cudaMalloc((void**)&temp_rows, active_warps * sizeof(IndexType));

  cudaMalloc((void**)&temp_vals, active_warps * sizeof(ValueType));

// Launch the kernels

  printf("GPU multiplication...\n");

 cutilCheckError(cutCreateTimer(&hTimer));

 cutilCheckError(cutResetTimer(hTimer));

 cudaThreadSynchronize();

 cutilCheckError(cutStartTimer(hTimer));
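
// the product is computed in three stages: (1) warp-parallel COO kernel over the
// first `tail` nonzeros, (2) serial kernel for the remaining nz - tail entries,
// (3) reduction kernel that folds the per-warp carry values into y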

 spmv_coo_kernel <<<num_blocks,BLOCK_SIZE>>> 

    (tail, interval_size, d_I, d_J, d_V, d_x, d_y, temp_rows, temp_vals);

 cudaThreadSynchronize();

 spmv_coo_serial_kernel <<<1,1>>>

    (nz - tail, d_I + tail, d_J + tail, d_V + tail, d_x, d_y);

 cudaThreadSynchronize();

 spmv_coo_reduce_update_kernel <<<1, 512>>> 

    (active_warps, temp_rows, temp_vals, d_y);

 cudaThreadSynchronize();

 cutilCheckError(cutStopTimer(hTimer));

 checkCUDAError("wywoanie kerneli");

 

 printf("Execution time (GPU) = %lf ms\n", cutGetTimerValue(hTimer));

 cutDeleteTimer(hTimer);

cudaFree(temp_rows);

  cudaFree(temp_vals);

if (UseCache)

 unbind_x(d_x);

cudaMemcpy(h_y, d_y, size_vec, cudaMemcpyDeviceToHost);

  checkCUDAError("cudaMemcpy to host");

printf("Result:\n");

  for (int k = 0; k < M; k+=s)

 printf("y[%d] = %lf\n", k, h_y[k]);

}

free(h_I);

free(h_J);

free(h_V);

free(h_x);

free(h_y);

free(h_y_cpu);

cudaFree(d_I);

cudaFree(d_J);

cudaFree(d_V);

cudaFree(d_x);

cudaFree(d_y);

return 0;

}

[/codebox]

The results of this program are strange: the CPU appears to be faster than the GPU. Example output:

$ release/spmv ../mtx/hood.mtx

Loading data...

s = 22054

CPU multiplication...

Execution time (CPU) = 40.953999 ms

Result:

y[0] = 1063117684.283157

y[22054] = 12640207895.224644

y[44108] = -33255447.632059

y[66162] = 868046245.778653

y[88216] = 261818029.206334

y[110270] = 222523762.127168

y[132324] = 1407517451.698023

y[154378] = 327192007.079969

y[176432] = 4373661739.092607

y[198486] = -66860974.932954

y[220540] = 2416760813.382476

num_units = 171702

num_warps = 8

num_blocks = 1

num_iters = 21463

interval_size = 686816

tail = 5494464

active_warps = 8

GPU multiplication...

Execution time (GPU) = 86.750999 ms

Result:

y[0] = 1063117684.283157

y[22054] = 12640207895.224644

y[44108] = -33255447.632060

y[66162] = 868046245.778653

y[88216] = 261818029.206334

y[110270] = 222523762.127168

y[132324] = 1407517451.698022

y[154378] = 327192007.079969

y[176432] = 4373661739.092607

y[198486] = -66860974.932954

y[220540] = 2416760813.382476

The GPU is more than twice as slow. Why? What am I doing wrong? My graphics card is a GeForce GTX 275.