Hallo,

I have a piece of very simple code written in Pycuda. Can someone tell me, why shouldn’t I set the index of array CC as “c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx”? For example, if I set the index of CC as 1 or 2 or 3, it can get the right value.

```
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
from pycuda.compiler import SourceModule
import numpy as np
kernel_code_template = """
#include <cuComplex.h>
__device__ void ExtractVector(cuFloatComplex *mul_a, int &idt, cuFloatComplex *A){
for (int i = 0; i < %(MATRIX_SIZE_O)d; i ++)
{
A[i] = make_cuFloatComplex(0,0);
A[i] = make_cuFloatComplex(cuCrealf(mul_a[i*%(tSize)d + idt]),cuCimagf(mul_a[i*%(tSize)d + idt]));
}
__syncthreads();
}
__global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *CC)
{
const uint wA = %(MATRIX_SIZE_O)d;
const uint wB = %(MATRIX_SIZE_O)d;
// Block index
const uint bx = blockIdx.x;
const uint by = blockIdx.y;
// Thread index
const uint tx = threadIdx.x;
const uint ty = threadIdx.y;
cuFloatComplex Csub = make_cuFloatComplex(0,0);
__shared__ cuFloatComplex VectorA[%(MATRIX_SIZE_O)d];
__shared__ cuFloatComplex C1[%(MATRIX_SIZE_O)d];
const uint aBegin = wA * %(BLOCK_SIZE)d * by;
const uint aEnd = aBegin + wA - 1;
const uint aStep = 1;
const int bBegin = %(BLOCK_SIZE)d * bx;
const uint bStep = wB;
const uint c = wA * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx ;
for (int index_r = 0; index_r < %(rSize)d; index_r ++)
{
for (int index_t = 0; index_t < %(tSize)d; index_t ++)
{
ExtractVector(A, index_t, VectorA);
for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep)
{
Csub = cuCaddf(Csub,cuCmulf(VectorA[a], B[b]));
__syncthreads();
}
CC[bx] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
__syncthreads();
}
}
}
"""
theta = (np.arange(-80, 80, 40))/360*2*np.pi
tsize = theta.size
rmax = 300
r = np.arange(0, rmax, rmax/3)
rsize = r.size
Mt = 1
Mr = 6
MATRIX_SIZE_O = Mt*Mr
MATRIX_SIZE_I = 1
BLOCK_SIZE = 1
BLOCK_SIZE_x = 1
BLOCK_SIZE_y = 1
GRID_SIZE = MATRIX_SIZE_O
kernel_code = kernel_code_template % {
'MATRIX_SIZE_O': MATRIX_SIZE_O,
'BLOCK_SIZE': BLOCK_SIZE,
'tSize': tsize,
'rSize': rsize,
}
a_cpu = 1 + 1j*1 + np.arange(0, tsize * MATRIX_SIZE_O)
a_cpu = a_cpu.reshape(MATRIX_SIZE_O,tsize).astype(np.complex64)
b_cpu = 1 + 1j*2 + np.arange(0, MATRIX_SIZE_O * MATRIX_SIZE_O)
b_cpu = b_cpu.reshape(MATRIX_SIZE_O,MATRIX_SIZE_O).astype(np.complex64)
bb_cpu = 1 + 1j*3 + np.arange(0, tsize * MATRIX_SIZE_O)
bb_cpu = bb_cpu.reshape(MATRIX_SIZE_O, tsize).astype(np.complex64)
f_cpu = 1 + 1j*4 + np.arange(0, rsize * MATRIX_SIZE_O)
f_cpu = f_cpu.reshape(MATRIX_SIZE_O,rsize).astype(np.complex64)
Bcon = np.real(a_cpu).astype(np.float32) + 1j*np.imag(a_cpu).astype(np.float32)
a_gpu = gpuarray.to_gpu(Bcon)
Rinv = np.real(b_cpu).astype(np.float32) + 1j*np.imag(b_cpu).astype(np.float32)
b_gpu = gpuarray.to_gpu(Rinv)
FFT_cpu = np.real(f_cpu).astype(np.float32) + 1j*np.imag(f_cpu).astype(np.float32)
f_gpu = gpuarray.to_gpu(FFT_cpu)
B = np.real(bb_cpu).astype(np.float32) + 1j*np.imag(bb_cpu).astype(np.float32)
bb_gpu = gpuarray.to_gpu(B)
beta_gpu = gpuarray.empty((rsize,tsize), np.complex64)
c_gpu = gpuarray.empty((MATRIX_SIZE_O), np.complex64)
beta_cpu = np.zeros(shape=(rsize,tsize)).astype(np.complex64)
for index_r in range (0, rsize):
for index_theta in range(0 , theta.size):
c_cpu = np.dot(a_cpu[:, index_theta], b_cpu)
beta_cpu[index_r, index_theta] = np.dot(c_cpu, f_cpu[:, index_r])
#/ (np.dot(c_cpu, bb_cpu[:, index_theta]))
#beta_cpu[index_r, index_theta] = np.dot(c_cpu, bb_cpu[:, index_theta])
mod = compiler.SourceModule(kernel_code)
matrixmul = mod.get_function("MatrixMulKernel")
matrixmul(
# inputs
a_gpu,
b_gpu,
# outputs
c_gpu,
# grid of multiple blocks
grid = (GRID_SIZE, GRID_SIZE),
# block of multiple threads
block = (BLOCK_SIZE_x , BLOCK_SIZE_y , 1)
)
# print the results
print("-" * 80)
print("Matrix A (GPU): ")
print(a_gpu.get())
print("-" * 80)
print("Matrix B (GPU): ")
print(bb_gpu.get())
print("-" * 80)
print("Matrix C (GPU): ")
print(c_gpu.get())
print("-" * 80)
print("Matrix C (CPU): ")
print(c_cpu)
print("-" * 80)
print("Matrix beta (GPU): ")
print(beta_gpu.get())
print("-" * 80)
print("Matrix beta (CPU): ")
print(beta_cpu)
print("-" * 80)
print("CPU-GPU Difference: ")
print(beta_cpu/beta_gpu.get())
```