Unknown error accumulating large array

Hello to everybody. I’m designing some kind of an extended-STL framework with CUDA implementation of algorithms. I’m experiencing a strange kind of error which occurs only accumulating a large array. I’m trying to perform a dot product between a matrix and a vector - two classic arrays, actually - in 3 steps:

matrix m, vector v, result-vector r, temp-result-matrix t_r

  1. t_r[i*rowsize+j] = m[i*rowsize+j] * v[j] //point-point product - 1st kernel

  2. t_r[i*rowsize+0] = /sum over j of t_r[i*rowsize+j]/ //accumulation of rows - 2nd kernel

  3. r[i] = t_r[i*rowsize+j] //copy step - 3rd kernel

Being the matrix, say, a x b - actually a 1D array, but however - everything works fine with a large ‘a’ (up to 1million) and small ‘b’ (up to 10) but, using a small ‘a’ and a not-so-large ‘b’, problems appear when ‘b’ reaches 10,000 circa - not too much, particularly, threads I create are less than BLOCK_SIZE*65535. Visual Studio 2008 compiles everything - obviously - but trying to execute program stops suddenly with this error:

Cuda error in file ‘c:/Users/ASUS/Documents/Visual Studio 2008/Projects/parallel_library/code/src/src_CUDA/algorithm_p_CUDA.cu’ in line 567 : unknown error.

“line 567” is a CUDA_SAFE_CALL(cudaThreadSynchronize()) after 2nd step row-accumulation.

I tried to put some cudaGetLastError() but I always get back a “cudaSuccess”…

Well, here is a simplified version of my code - tested & not working!:

// main.cpp

int n = 20000;

matrix_p<int> m(2, n, 1); // 2 x n, all elements 1

vector_p<int> v(n, 2); // n elements of value 2

vector_p<int> r(2); // 2 elements uninizialized

dot_product_p<int>(m, v, r);

// algorithm_p.h

template <typename T>

void dot_product_p(matrix_p<T>& mat, vector_p<T>& vet, vector_p<T>& res_vet)

{

	dot_product_mat_vect_CUDA_temp(mat.device_pointer(), vet.device_pointer(), res_vet.device_pointer(), mat.row_size(), mat.col_size());

}

//algorithm_CUDA.cu

template <typename T>

void dot_product_mat_vect_CUDA_temp(T* mat, T* vet, T* res, int rowsize, int colsize)

{

	int block_size = 512;		

	T* res_mat = cuda_alloc_dev<T>(rowsize*colsize);		

	dim3 threads_per_block(block_size, 1, 1);		

	dim3 blocks_per_grid((rowsize*colsize + block_size - 1)/block_size, 1, 1);			

	point_point_matvet_product_kernel<<<blocks_per_grid,threads_per_block>>>(mat, vet, res_mat, rowsize, colsize);			

	CUDA_SAFE_CALL( cudaThreadSynchronize() );

	int k = 16;

	int threads_per_row = ceilf( (float)rowsize/(float)k );

	int n_threads = threads_per_row * colsize;

	float rapp = logf(rowsize)/logf(k);

	int p = ceilf(rapp); // p = ceil(log_k(n)) = ceil(log(n) / log(k))

	for(int j=1; j<=p; ++j)

	{				

		threads_per_row = ceilf( rowsize/powf(k,j) );

		n_threads = threads_per_row * colsize;

		dim3 blocks_per_grid2((n_threads+block_size-1)/block_size, 1, 1);

		accumulate_mat_rows_kernel<<<blocks_per_grid2,threads_per_block>>>(res_mat,rowsize,colsize,j,k,threads_per_row);	

		CUDA_SAFE_CALL( cudaThreadSynchronize() );

	}

	dim3 blocks_per_grid3((colsize + block_size - 1)/block_size, 1, 1);	

	recover_data_kernel<<<blocks_per_grid3,threads_per_block>>>(res_mat,res,rowsize,colsize);

	CUDA_SAFE_CALL( cudaThreadSynchronize() );

	CUDA_SAFE_CALL( cudaFree(res_mat) );

}

// kernels.cu

template <typename T>

__global__ void point_point_matvet_product_kernel(T* mat, T* vet, T* res_mat, int rowsize, int colsize)

{

	int index = blockIdx.x * blockDim.x + threadIdx.x;

	if( index < rowsize*colsize)

	{

		res_mat[index] = mat[index] * vet[index % rowsize];

	}

}

// ...

template <typename T>

__global__ void accumulate_mat_rows_kernel(T* m, int rowsize, int colsize, int j, int k, int tpr)

{

		

	int POW2 = 1;

	// POW2 = powf(k,j-1) - less accurate than explicit power

	for(int l=0; l<j-1; l++)

		POW2 *= k;

	int POW1 = k*POW2; // POW1 = powf(k,j)

	int index_in_row = (blockIdx.x * blockDim.x + threadIdx.x) % tpr;

	int ind = (blockIdx.x * blockDim.x + threadIdx.x) / tpr;

	int index1 = index_in_row*POW1;	

	int index2;

	for(int i=1; i<k; ++i)

	{

		index2 = index1 + i*POW2;			

		if(index2 < rowsize)

			m[ind*rowsize + index1] += m[ind*rowsize + index2];

	}		

}

To perform rows accumulation, I used some kind of a logarithmic sum where:

k = 16; // number of elements accumulated per-thread

p = log_k( rowsize ); //number of steps - "passi" in italian :)

for(int j=1; j<=p; ++j)

{

    for(int el=1; el<k; ++el)

        vett[k^j] += vett[k^j + (k^(j-1))*el]

}

Ok, that’s all. Thanks to anybody would be so nice to answer.

So… No one answers!? External Image