Hi,
I am a beginner to cuda programming and I do not know much about cuda. I have a program which passes a 2D matrix to device and I need to extract each column of it in parallel. I wrote a code but it does not give the correct answer.
The program reads the norm.txt into M matrix in FileRead() and convert this ELLPACK sparse format using ellpack() into val,col, and rL matrices.
The column_k_extract() kernel is to extract each column of M from col,val, and rL. But I can’t access the data using indices. When the value is printed it shows some other values like column[0]=166602288. But the actual value is 0.
I can’t find the error. I think it may be the indexing of matrices, not sure. Because the host data is in 2D and device data is in 1D.
The code is :
#define N 12
#define Q 5
float **M, **val,*u;
int **col, *rL,mx=0;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
__global__ void colomn_k_extract(float*data, int*column, int*rowL, int ncols){
int dim =blockDim.x; int bid=blockIdx.x; int tid=threadIdx.x; int row=dim*bid+tid;
float v[N];
if (row<N) {
int ma,i,j=0;
while(j<N) {
i=0; ma=rowL[j];
while(i<ma) {
int pos=j*ncols+i;
if(column[pos]==row) {
v[j]=data[pos]; i=ma;
}
else {
v[j]=0.0; i+=1;
}}
j++;
}
__syncthreads();
}}
int main(void){
float *dev_M, *dev_val, *dev_u, *dev_v; int *dev_col, *dev_rL,*ncols; int i, j;
M = Make2DFloatArray(N,N); val = Make2DFloatArray(N,Q); col = Make2DIntArray(N,Q);
rL = (int*) malloc(N*sizeof(int));
FileRead(); ellpack();
cudaMalloc((void**) &dev_M, sizeof(float) * N*N);
cudaMalloc((void**) &dev_val, sizeof(float) * N*Q);
cudaMalloc((void**) &dev_col, sizeof(int) * N*Q);
cudaMalloc((void**) &dev_rL, sizeof(int) * N);
cudaMalloc((void**) &dev_u, sizeof(float) * N);
cudaMalloc((void**) &dev_v, sizeof(float) * N);
cudaMalloc((void**) &ncols, sizeof(int));
cudaMemcpy(dev_M, M,sizeof(float)*N*N, cudaMemcpyHostToDevice);
cudaMemcpy(dev_val, val,sizeof(float) * N*Q, cudaMemcpyHostToDevice);
cudaMemcpy(dev_col, col,sizeof(int) * N*Q, cudaMemcpyHostToDevice);
cudaMemcpy(dev_rL, rL,sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ncols, &mx,sizeof(int), cudaMemcpyHostToDevice);
colomn_k_extract<<<blocksPerGrid,threadsPerBlock>>>( dev_val, dev_col, dev_rL, mx, dev_u );
cudaMemcpy(u,dev_u,sizeof(int) * N, cudaMemcpyHostToDevice);
cudaFree(dev_M); cudaFree(dev_val);cudaFree(dev_col); cudaFree(dev_rL);
for (i = 0; i < N; i++){
free(M[i]);
}free(M);
for (i = 0; i < N; i++){
free(val[i]);
}free(val);
for (i = 0; i < N; i++){
free(col[i]);
}free(col);
free(rL);
return 0;
}
please help me to find the error.
Thank you in advance
norm.txt (421 Bytes)
valELL.txt (763 Bytes)
ell.cu (3.17 KB)