Hello to everybody. I’m designing some kind of an extended-STL framework with CUDA implementation of algorithms. I’m experiencing a strange kind of error which occurs only accumulating a large array. I’m trying to perform a dot product between a matrix and a vector - two classic arrays, actually - in 3 steps:
matrix m, vector v, result-vector r, temp-result-matrix t_r
-
t_r[i*rowsize+j] = m[i*rowsize+j] * v[j] //point-point product - 1st kernel
-
t_r[i*rowsize+0] = /sum over j of t_r[i*rowsize+j]/ //accumulation of rows - 2nd kernel
-
r[i] = t_r[i*rowsize+j] //copy step - 3rd kernel
Being the matrix, say, a x b - actually a 1D array, but however - everything works fine with a large ‘a’ (up to 1million) and small ‘b’ (up to 10) but, using a small ‘a’ and a not-so-large ‘b’, problems appear when ‘b’ reaches 10,000 circa - not too much, particularly, threads I create are less than BLOCK_SIZE*65535. Visual Studio 2008 compiles everything - obviously - but trying to execute program stops suddenly with this error:
Cuda error in file ‘c:/Users/ASUS/Documents/Visual Studio 2008/Projects/parallel_library/code/src/src_CUDA/algorithm_p_CUDA.cu’ in line 567 : unknown error.
“line 567” is a CUDA_SAFE_CALL(cudaThreadSynchronize()) after 2nd step row-accumulation.
I tried to put some cudaGetLastError() but I always get back a “cudaSuccess”…
Well, here is a simplified version of my code - tested & not working!:
// main.cpp
int n = 20000;
matrix_p<int> m(2, n, 1); // 2 x n, all elements 1
vector_p<int> v(n, 2); // n elements of value 2
vector_p<int> r(2); // 2 elements uninizialized
dot_product_p<int>(m, v, r);
// algorithm_p.h
template <typename T>
void dot_product_p(matrix_p<T>& mat, vector_p<T>& vet, vector_p<T>& res_vet)
{
dot_product_mat_vect_CUDA_temp(mat.device_pointer(), vet.device_pointer(), res_vet.device_pointer(), mat.row_size(), mat.col_size());
}
//algorithm_CUDA.cu
template <typename T>
void dot_product_mat_vect_CUDA_temp(T* mat, T* vet, T* res, int rowsize, int colsize)
{
int block_size = 512;
T* res_mat = cuda_alloc_dev<T>(rowsize*colsize);
dim3 threads_per_block(block_size, 1, 1);
dim3 blocks_per_grid((rowsize*colsize + block_size - 1)/block_size, 1, 1);
point_point_matvet_product_kernel<<<blocks_per_grid,threads_per_block>>>(mat, vet, res_mat, rowsize, colsize);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
int k = 16;
int threads_per_row = ceilf( (float)rowsize/(float)k );
int n_threads = threads_per_row * colsize;
float rapp = logf(rowsize)/logf(k);
int p = ceilf(rapp); // p = ceil(log_k(n)) = ceil(log(n) / log(k))
for(int j=1; j<=p; ++j)
{
threads_per_row = ceilf( rowsize/powf(k,j) );
n_threads = threads_per_row * colsize;
dim3 blocks_per_grid2((n_threads+block_size-1)/block_size, 1, 1);
accumulate_mat_rows_kernel<<<blocks_per_grid2,threads_per_block>>>(res_mat,rowsize,colsize,j,k,threads_per_row);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
}
dim3 blocks_per_grid3((colsize + block_size - 1)/block_size, 1, 1);
recover_data_kernel<<<blocks_per_grid3,threads_per_block>>>(res_mat,res,rowsize,colsize);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUDA_SAFE_CALL( cudaFree(res_mat) );
}
// kernels.cu
template <typename T>
__global__ void point_point_matvet_product_kernel(T* mat, T* vet, T* res_mat, int rowsize, int colsize)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if( index < rowsize*colsize)
{
res_mat[index] = mat[index] * vet[index % rowsize];
}
}
// ...
template <typename T>
__global__ void accumulate_mat_rows_kernel(T* m, int rowsize, int colsize, int j, int k, int tpr)
{
int POW2 = 1;
// POW2 = powf(k,j-1) - less accurate than explicit power
for(int l=0; l<j-1; l++)
POW2 *= k;
int POW1 = k*POW2; // POW1 = powf(k,j)
int index_in_row = (blockIdx.x * blockDim.x + threadIdx.x) % tpr;
int ind = (blockIdx.x * blockDim.x + threadIdx.x) / tpr;
int index1 = index_in_row*POW1;
int index2;
for(int i=1; i<k; ++i)
{
index2 = index1 + i*POW2;
if(index2 < rowsize)
m[ind*rowsize + index1] += m[ind*rowsize + index2];
}
}
To perform rows accumulation, I used some kind of a logarithmic sum where:
k = 16; // number of elements accumulated per-thread
p = log_k( rowsize ); //number of steps - "passi" in italian :)
for(int j=1; j<=p; ++j)
{
for(int el=1; el<k; ++el)
vett[k^j] += vett[k^j + (k^(j-1))*el]
}
Ok, that’s all. Thanks to anybody would be so nice to answer.