Hi,
I am a beginner in CUDA programming.I have to use shared memory. In my task I have 145 vectors with many thousands of elements. There vectors are written in a one-dimensional array. The result must be a scalar product of all vectors. I can’t load data into buffer. I should load warp with 32 elements for each vector and then make the product. Can you show me what I’m doing wrong?
Here is my code
global void scalar_13(real *Mat, int chunk_size, int Len, int Size, real *resMat ){
int s_x = threadIdx.x + blockIdx.x * blockDim.x;
int s_y = threadIdx.y + blockIdx.y * blockDim.y;
int i;
int ad_x;
int ad_y;
double res;
int th_num = threadIdx.y*blockDim.x+threadIdx.x;
int iter = Len/chunk_size;
int remain, start_x, start_y;
extern __shared__ float buffer[];
res = 0.0;
iter = Len / chunk_size;
remain = Len % chunk_size;
int n;
ad_x = s_x*Len; //beginning of the vector X
ad_y = s_y*Len; //beginning of the vector Y
int offset_x = threadIdx.x * chunk_size;
int offset_y = blockDim.x * chunk_size + threadIdx.y * chunk_size;
if ( (s_x<Size) & (s_y<Size) ) {
for (n=0;n<iter;n++) {
if (th_num < 32) {
for (int ii=0;ii<blockDim.x;ii++) {
buffer[ii * chunk_size + th_num] = Mat[ad_x + n * chunk_size + th_num];
}
for (int ii=0;ii<blockDim.y;ii++) {
buffer[(blockDim.x + ii) * chunk_size + th_num] = Mat[ad_y + n * chunk_size + th_num];
}
}
__syncthreads();
for (int i=0; i<chunk_size; i++) {
res+= buffer[offset_x+i]*buffer[offset_y+i];
ad_x++;
ad_y++;
}
}
}
__syncthreads();
if ( (s_x<Size) & (s_y<Size) ) {
for (int ii=0;ii<blockDim.x;ii++) {
buffer[offset_x + th_num + ii * chunk_size] = Mat[ad_x + iter * chunk_size + th_num];
}
for (int ii=0;ii<blockDim.y;ii++) {
buffer[offset_y + th_num + ii * chunk_size] = Mat[ad_y + iter * chunk_size + th_num];
}
__syncthreads();
for (int i=0; i<remain; i++) {
res+= buffer[offset_x+i]*buffer[offset_y+i];
ad_x++;
ad_y++;
}
}
__syncthreads();
resMat[s_x*Size+s_y] = res;
__syncthreads();
}
Thanks