Hi.
I am experimenting with SpMV multiplication. Matrix is in COO format. I use kernels from the cusp-library (Google Code Archive - Long-term storage for Google Code Project Hosting.). Here’s the full source code of my programe:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <sys/time.h>
#include <cutil.h>
#include <cutil_inline.h>
#include “mmio.c”
#define DIVIDE_INTO(x, y) ((x + y - 1)/y)
const unsigned int WARP_SIZE = 32;
const unsigned int MAX_THREADS = 512;
const unsigned int BLOCK_SIZE = 256;
const unsigned int MAX_BLOCKS = MAX_THREADS / (2 * BLOCK_SIZE);
const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
const bool UseCache = false;
typedef int IndexType;
typedef double ValueType;
texture<int2,1> tex_x_double;
void bind_x(const double * x)
{ CUDA_SAFE_CALL(cudaBindTexture(NULL, tex_x_double, x)); }
void unbind_x(const double * x)
{ CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_double)); }
inline device double fetch_x(const int& i, const double * x)
{
#if CUDA_ARCH >= 130
// double requires Compute Capability 1.3 or greater
if (UseCache)
{
int2 v = tex1Dfetch(tex_x_double, i);
return __hiloint2double(v.y, v.x);
}
else
{
return x[i];
}
return 1.0/0.0; // should never be called
}
texture<float,1> tex_x_float;
void bind_x(const float * x)
{ CUDA_SAFE_CALL(cudaBindTexture(NULL, tex_x_float, x)); }
void unbind_x(const float * x)
{ CUDA_SAFE_CALL(cudaUnbindTexture(tex_x_float)); }
inline device float fetch_x(const int& i, const float * x)
{
if (UseCache)
return tex1Dfetch(tex_x_float, i);
else
return x[i];
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
// segmented reduction in shared memory
device ValueType segreduce_warp(const IndexType thread_lane, IndexType row, ValueType val, IndexType * rows, ValueType * vals)
{
rows[threadIdx.x] = row;
vals[threadIdx.x] = val;
if( thread_lane >= 1 && row == rows[threadIdx.x - 1] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 1]; }
if( thread_lane >= 2 && row == rows[threadIdx.x - 2] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 2]; }
if( thread_lane >= 4 && row == rows[threadIdx.x - 4] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 4]; }
if( thread_lane >= 8 && row == rows[threadIdx.x - 8] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 8]; }
if( thread_lane >= 16 && row == rows[threadIdx.x - 16] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }
return val;
}
global void
spmv_coo_kernel(const IndexType num_nonzeros,
const IndexType interval_size,
const IndexType * I,
const IndexType * J,
const ValueType * V,
const ValueType * x,
ValueType * y,
IndexType * temp_rows,
ValueType * temp_vals)
{
__shared__ volatile IndexType rows[48 *(BLOCK_SIZE/32)];
__shared__ volatile ValueType vals[BLOCK_SIZE];
const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index
const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp
const IndexType warp_id = thread_id / WARP_SIZE; // global warp index
const IndexType interval_begin = warp_id * interval_size; // warp’s offset into I,J,V
const IndexType interval_end = min(interval_begin + interval_size, num_nonzeros); // end of warps's work
const IndexType idx = 16 * (threadIdx.x/32 + 1) + threadIdx.x; // thread’s index into padded rows array
rows[idx - 16] = -1; // fill padding with invalid row index
if(interval_begin >= interval_end) // warp has no work to do
return;
if (thread_lane == 31)
{
// initialize the carry in values
rows[idx] = I[interval_begin];
vals[threadIdx.x] = 0;
}
for(IndexType n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE)
{
IndexType row = I[n]; // row index (i)
ValueType val = V[n] * fetch_x(J[n], x); // A(i,j) * x(j)
if (thread_lane == 0)
{
if(row == rows[idx + 31])
val += vals[threadIdx.x + 31]; // row continues
else
y[rows[idx + 31]] += vals[threadIdx.x + 31]; // row terminated
}
rows[idx] = row;
vals[threadIdx.x] = val;
if(row == rows[idx - 1]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 1]; }
if(row == rows[idx - 2]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 2]; }
if(row == rows[idx - 4]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 4]; }
if(row == rows[idx - 8]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 8]; }
if(row == rows[idx - 16]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }
if(thread_lane < 31 && row != rows[idx + 1])
y[row] += vals[threadIdx.x]; // row terminated
}
if(thread_lane == 31)
{
// write the carry out values
temp_rows[warp_id] = rows[idx];
temp_vals[warp_id] = vals[threadIdx.x];
}
}
// Segmented redction
device void segreduce_block(const IndexType * idx, ValueType * val)
{
ValueType left = 0;
if( threadIdx.x >= 1 && idx[threadIdx.x] == idx[threadIdx.x - 1] ) { left = val[threadIdx.x - 1]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 2 && idx[threadIdx.x] == idx[threadIdx.x - 2] ) { left = val[threadIdx.x - 2]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 4 && idx[threadIdx.x] == idx[threadIdx.x - 4] ) { left = val[threadIdx.x - 4]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 8 && idx[threadIdx.x] == idx[threadIdx.x - 8] ) { left = val[threadIdx.x - 8]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 16 && idx[threadIdx.x] == idx[threadIdx.x - 16] ) { left = val[threadIdx.x - 16]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 32 && idx[threadIdx.x] == idx[threadIdx.x - 32] ) { left = val[threadIdx.x - 32]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 64 && idx[threadIdx.x] == idx[threadIdx.x - 64] ) { left = val[threadIdx.x - 64]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 128 && idx[threadIdx.x] == idx[threadIdx.x - 128] ) { left = val[threadIdx.x - 128]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
if( threadIdx.x >= 256 && idx[threadIdx.x] == idx[threadIdx.x - 256] ) { left = val[threadIdx.x - 256]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
}
// The second level of the segmented reduction operation
global void
spmv_coo_reduce_update_kernel(const IndexType num_warps,
const IndexType * temp_rows,
const ValueType * temp_vals,
ValueType * y)
{
__shared__ IndexType rows[BLOCK_SIZE + 1];
__shared__ ValueType vals[BLOCK_SIZE + 1];
const IndexType end = num_warps - (num_warps & (BLOCK_SIZE - 1));
if (threadIdx.x == 0)
{
rows[BLOCK_SIZE] = (IndexType) -1;
vals[BLOCK_SIZE] = (ValueType) 0;
}
__syncthreads();
IndexType i = threadIdx.x;
while (i < end)
{
// do full blocks
rows[threadIdx.x] = temp_rows[i];
vals[threadIdx.x] = temp_vals[i];
__syncthreads();
segreduce_block(rows, vals);
if (rows[threadIdx.x] != rows[threadIdx.x + 1])
y[rows[threadIdx.x]] += vals[threadIdx.x];
__syncthreads();
i += BLOCK_SIZE;
}
if (end < num_warps){
if (i < num_warps){
rows[threadIdx.x] = temp_rows[i];
vals[threadIdx.x] = temp_vals[i];
} else {
rows[threadIdx.x] = (IndexType) -1;
vals[threadIdx.x] = (ValueType) 0;
}
__syncthreads();
segreduce_block(rows, vals);
if (i < num_warps)
if (rows[threadIdx.x] != rows[threadIdx.x + 1])
y[rows[threadIdx.x]] += vals[threadIdx.x];
}
}
global void
spmv_coo_serial_kernel(const IndexType num_nonzeros,
const IndexType * I,
const IndexType * J,
const ValueType * V,
const ValueType * x,
ValueType * y)
{
for(IndexType n = 0; n < num_nonzeros; n++){
y[I[n]] += V[n] * x[J[n]];
}
}
// prosty algorytm mnozacy na CPU
void
spmv_coo_CPU(const IndexType num_nonzeros,
const IndexType * I,
const IndexType * J,
const ValueType * V,
const ValueType * x,
ValueType * y)
{
IndexType i;
for (i = 0; i < num_nonzeros; i++)
y[I[i]] += V[i] * x[J[i]];
}
int main(int argc, char *argv)
{
int ret_code;
MM_typecode matcode;
FILE *f;
int M, N, nz;
int i, s;
struct timeval tStart, tStop, tInterval;
unsigned int hTimer;
double czas;
IndexType *h_I, *h_J;
ValueType *h_V, *h_x, *h_y, *h_y_cpu;
if (argc < 2)
{
fprintf(stderr, "Uzycie: %s [nazwa-pliku]\n", argv[0]);
exit(1);
}
else
{
if ((f = fopen(argv[1], "r")) == NULL)
exit(1);
}
if (mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner.\n");
exit(1);
}
/* This is how one can screen matrix types if their application */
/* only supports a subset of the Matrix Market data types. */
if (mm_is_complex(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode) )
{
printf("Sorry, this application does not support ");
printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));
exit(1);
}
printf(“Loading data…\n”);
if ((ret_code = mm_read_mtx_crd_size(f, &N, &M, &nz)) != 0)
exit(1);
size_t size_index = nz * sizeof(IndexType);
size_t size_value = nz * sizeof(ValueType);
h_I = (IndexType *) malloc(size_index);
h_J = (IndexType *) malloc(size_index);
h_V = (ValueType *) malloc(size_value);
for (i = 0; i < nz; ++i)
{
fscanf(f, "%d %d %lf\n", &h_J[i], &h_I[i], &h_V[i]);
h_I[i]--;
h_J[i]--;
}
if (f != stdin) fclose(f);
size_t size_vec = M * sizeof(ValueType);
h_x = (ValueType *) malloc(size_vec);
for (i = 0; i < M; i++)
h_x[i] = 2.0+i*0.005;
h_y = (ValueType *) calloc(M, sizeof(ValueType));
h_y_cpu = (ValueType *) calloc(M, sizeof(ValueType));
IndexType *d_I, *d_J;
ValueType *d_V, *d_x, *d_y;
cudaMalloc((void**)&d_I, size_index);
cudaMalloc((void**)&d_J, size_index);
cudaMalloc((void**)&d_V, size_value);
cudaMalloc((void**)&d_x, size_vec);
cudaMalloc((void**)&d_y, size_vec);
checkCUDAError(“cudaMalloc”);
cudaMemcpy(d_I, h_I, size_index, cudaMemcpyHostToDevice);
cudaMemcpy(d_J, h_J, size_index, cudaMemcpyHostToDevice);
cudaMemcpy(d_V, h_V, size_value, cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, size_vec, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, size_vec, cudaMemcpyHostToDevice);
checkCUDAError(“cudaMemcpy to device”);
s = M/10;
if (s == 0)
s = 1;
printf(“s = %d\n”, s);
printf(“CPU multiplication…\n”);
cutCreateTimer(&hTimer);
cutResetTimer(hTimer);
cudaThreadSynchronize();
cutStartTimer(hTimer);
spmv_coo_CPU(nz, h_I, h_J, h_V, h_x, h_y_cpu);
cutStopTimer(hTimer);
printf(“Execution time (CPU) = %lf ms\n”, cutGetTimerValue(hTimer));
cutDeleteTimer(hTimer);
printf(“Result:\n”);
for (int k = 0; k < M; k+=s)
printf("y[%d] = %lf\n", k, h_y_cpu[k]);
if (nz == 0)
printf("Empty matrix\n");
else if (nz < WARP_SIZE)
{
printf("Small matrix\n");
spmv_coo_serial_kernel <<<1,1>>>
(nz, d_I, d_J, d_V, d_x, d_y);
cudaMemcpy(h_y, d_y, size_vec, cudaMemcpyDeviceToHost);
printf("Result:\n");
for (i = 0; i < M; ++i)
printf("%lf ", h_y[i]);
printf("\n");
}
else
{
const unsigned int num_units = nz / WARP_SIZE;
const unsigned int num_warps = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS);
const unsigned int num_blocks = DIVIDE_INTO(num_warps, WARPS_PER_BLOCK);
const unsigned int num_iters = DIVIDE_INTO(num_units, num_warps);
const unsigned int interval_size = WARP_SIZE * num_iters;
const IndexType tail = num_units * WARP_SIZE;
const unsigned int active_warps = (interval_size == 0) ? 0 : DIVIDE_INTO(tail, interval_size);
printf(“num_units = %u\n”, num_units);
printf("num_warps = %u\n", num_warps);
printf("num_blocks = %u\n", num_blocks);
printf("num_inters = %u\n", num_iters);
printf("interval_size = %u\n", interval_size);
printf("tail = %d\n", tail);
printf("active_warps = %u\n", active_warps);
if (UseCache)
bind_x(d_x);
checkCUDAError("bind");
IndexType * temp_rows;
ValueType * temp_vals;
cudaMalloc((void**)&temp_rows, active_warps * sizeof(IndexType));
cudaMalloc((void**)&temp_vals, active_warps * sizeof(ValueType));
// Odpalamy kernel
printf("GPU multiplication...\n");
cutilCheckError(cutCreateTimer(&hTimer));
cutilCheckError(cutResetTimer(hTimer));
cudaThreadSynchronize();
cutilCheckError(cutStartTimer(hTimer));
spmv_coo_kernel <<<num_blocks,BLOCK_SIZE>>>
(tail, interval_size, d_I, d_J, d_V, d_x, d_y, temp_rows, temp_vals);
cudaThreadSynchronize();
spmv_coo_serial_kernel <<<1,1>>>
(nz - tail, d_I + tail, d_J + tail, d_V + tail, d_x, d_y);
cudaThreadSynchronize();
spmv_coo_reduce_update_kernel <<<1, 512>>>
(active_warps, temp_rows, temp_vals, d_y);
cudaThreadSynchronize();
cutilCheckError(cutStopTimer(hTimer));
checkCUDAError("wywoanie kerneli");
printf("Execution time (GPU) = %lf ms\n", cutGetTimerValue(hTimer));
cutDeleteTimer(hTimer);
cudaFree(temp_rows);
cudaFree(temp_vals);
if (UseCache)
unbind_x(d_x);
cudaMemcpy(h_y, d_y, size_vec, cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy to host");
printf(“Result:\n”);
for (int k = 0; k < M; k+=s)
printf("y[%d] = %lf\n", k, h_y[k]);
}
free(h_I);
free(h_J);
free(h_V);
free(h_x);
free(h_y);
free(h_y_cpu);
cudaFree(d_I);
cudaFree(d_J);
cudaFree(d_V);
cudaFree(d_x);
cudaFree(d_y);
return 0;
}
[/codebox]
Results of this programme are strange. CPU seems to be faster than GPU. Example results:
$ release/spmv ../mtx/hood.mtx
Loading data...
s = 22054
CPU multiplication...
Execution time (CPU) = 40.953999 ms
Result:
y[0] = 1063117684.283157
y[22054] = 12640207895.224644
y[44108] = -33255447.632059
y[66162] = 868046245.778653
y[88216] = 261818029.206334
y[110270] = 222523762.127168
y[132324] = 1407517451.698023
y[154378] = 327192007.079969
y[176432] = 4373661739.092607
y[198486] = -66860974.932954
y[220540] = 2416760813.382476
num_units = 171702
num_warps = 8
num_blocks = 1
num_inters = 21463
interval_size = 686816
tail = 5494464
active_warps = 8
GPU multiplication...
Execution time (GPU) = 86.750999 ms
Result:
y[0] = 1063117684.283157
y[22054] = 12640207895.224644
y[44108] = -33255447.632060
y[66162] = 868046245.778653
y[88216] = 261818029.206334
y[110270] = 222523762.127168
y[132324] = 1407517451.698022
y[154378] = 327192007.079969
y[176432] = 4373661739.092607
y[198486] = -66860974.932954
y[220540] = 2416760813.382476
GPU is over 2 times slower. Why? What am I doing wrong? My graphic card is GeForce GTX 275.