Hello,
I have a problem: in the second kernel, "DotKernel", I can't change the values of any shared or global array.
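As a first check, a minimal kernel with the same one-thread-per-block launch (the kernel name WriteKernel and the size 35 are just placeholders) shows whether global writes land at all; if this fills the array, the problem is specific to DotKernel:

import pycuda.autoinit
import numpy as np
from pycuda import compiler, gpuarray

sanity = compiler.SourceModule("""
__global__ void WriteKernel(float *out)
{
    // one thread per block, one block per element
    out[blockIdx.x] = 1.0f;
}
""")
write = sanity.get_function("WriteKernel")
out_gpu = gpuarray.zeros(35, np.float32)
write(out_gpu, grid=(35, 1), block=(1, 1, 1))
print(out_gpu.get())  # all ones if global writes work

The full script follows.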
import time

import numpy as np
import matplotlib.pyplot as plt
from scipy import signal

import pycuda.autoinit
from pycuda import compiler, driver, gpuarray
kernel_code_template = """
#include <cuComplex.h>
__global__ void OuterKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA2, cuFloatComplex *C)
{
    const uint wB = %(MATRIX_SIZE_O)d;
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

    // with a 1x1 block these shared variables are effectively private to the block
    __shared__ cuFloatComplex A_y;
    __shared__ cuFloatComplex B_y;
    __shared__ cuFloatComplex beta2;
    __shared__ cuFloatComplex C_add;

    const uint c = wB * by + bx;  // linear index of this block's output element

    for (int index_r = 0; index_r < %(rSize)d; index_r++)
    {
        for (int wA = 1; wA < %(tSize)d + 1; wA++)
        {
            const uint aBegin = %(tSize)d * %(BLOCK_SIZE)d * by;
            const uint aEnd   = aBegin + wA - 1;
            const uint aStep  = %(BLOCK_SIZE)d;
            const int  bBegin = %(tSize)d * %(BLOCK_SIZE)d * bx;
            const uint bStep  = %(BLOCK_SIZE)d;

            for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
            {
                A_y = A[a + wB*ty + tx];
                B_y = B[b + wB*ty + tx];
                // overwritten each pass: only the last product of this loop survives
                C_add = cuCmulf(A_y, B_y);
            }

            beta2 = BETA2[%(tSize)d * index_r + wA - 1];
            C_add = cuCmulf(C_add, beta2);
            C[c] = cuCaddf(C[c], C_add);
        }
    }
}
__global__ void DotKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA, cuFloatComplex *C, cuFloatComplex *FFT, cuFloatComplex *CC)
{
    const uint wA = %(MATRIX_SIZE_I)d;
    const uint wB = %(MATRIX_SIZE_O)d;
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;

    const uint aBegin = wA * %(BLOCK_SIZE)d * by;
    const uint aEnd   = aBegin + wA - 1;
    const uint aStep  = 1;
    const int  bBegin = %(BLOCK_SIZE)d * bx;
    const uint bStep  = wB;

    cuFloatComplex Csub   = make_cuFloatComplex(0, 0);
    cuFloatComplex Csub_1 = make_cuFloatComplex(0, 0);
    cuFloatComplex Csub_2 = make_cuFloatComplex(0, 0);

    // per-block scratch: each block has its own copy, sized MATRIX_SIZE_I
    __shared__ cuFloatComplex A_global[%(MATRIX_SIZE_I)d];
    __shared__ cuFloatComplex C1[%(MATRIX_SIZE_I)d];

    for (int index_r = 0; index_r < %(rSize)d; index_r++)
    {
        for (int index_t = 0; index_t < %(tSize)d; index_t++)
        {
            // stage column index_t of B in shared memory
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
                A_global[i] = B[i * %(tSize)d + index_t];
            __syncthreads();

            // dot product of the staged column with a stripe of C
            Csub = make_cuFloatComplex(0, 0);
            for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
                Csub = cuCaddf(Csub, cuCmulf(A_global[a], C[b]));

            // c spans the whole wB-by-wB grid, so for by > 0 it runs past the
            // ends of C1, A_global and CC, which only hold MATRIX_SIZE_I elements
            const uint c = wB * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx;
            C1[c] = Csub;
            __syncthreads();

            // stage column index_r of FFT
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
                A_global[i] = FFT[i * %(rSize)d + index_r];

            CC[c] = make_cuFloatComplex(0, 0);
            __syncthreads();
            CC[c] = cuCmulf(C1[c], A_global[c]);
            __syncthreads();

            // numerator: sum the first MATRIX_SIZE_I entries of CC
            Csub_1 = make_cuFloatComplex(0, 0);
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
                Csub_1 = cuCaddf(Csub_1, CC[i]);

            // stage column index_t of A
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
                A_global[i] = A[i * %(tSize)d + index_t];

            CC[c] = make_cuFloatComplex(0, 0);
            __syncthreads();
            CC[c] = cuCmulf(C1[c], A_global[c]);
            __syncthreads();

            // denominator: same reduction with the A column
            Csub_2 = make_cuFloatComplex(0, 0);
            for (int i = 0; i < %(MATRIX_SIZE_I)d; i++)
                Csub_2 = cuCaddf(Csub_2, CC[i]);

            BETA[index_r * %(tSize)d + index_t] = cuCdivf(Csub_1, Csub_2);
        }
    }
}
"""
tic = time.time()
iaa_iter = 4
c = 3e8
fc = 3.15e9
lam = c/fc
Mt = 7
Mr = 5
rmax = 300
theta = (np.arange(-80, 80, 0.5))/360*2*np.pi
r = np.arange(0, rmax, rmax/100)
tsize = theta.size
rsize = r.size
# placeholder input spectrum (stand-in data)
fmcw_fft = np.arange(0, Mt*Mr*theta.size) + 1j*1
fmcw_fft = fmcw_fft.reshape(Mt*Mr, theta.size).astype(np.complex64)
# virtual-array element positions: (1..Mt*Mr) * lam/2
VX = np.arange(1, Mt*Mr + 1) * lam / 2
B = np.zeros((Mt*Mr, theta.size), dtype=complex)
for index_theta in range(theta.size):
    # steering vector for this look angle
    alpha = np.exp(1j*2*np.pi*fc*np.sin(theta[index_theta])/c*VX)
    if iaa_iter == 0:
        B[:, index_theta] = alpha * signal.windows.nuttall(Mr*Mt)
    else:
        B[:, index_theta] = alpha
# CPU reference betas
beta = np.zeros((r.size, theta.size), dtype=complex)
for index_r in range(r.size):
    for index_theta in range(theta.size):
        beta[index_r, index_theta] = (np.dot(np.conj(B[:, index_theta]), fmcw_fft[:, index_r])
                                      / np.dot(np.conj(B[:, index_theta]), B[:, index_theta]))
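The same betas can be computed in one shot with matrix products; this is pure NumPy and is handy later as a reference to compare the GPU output against:

# beta[r, t] = (B[:, t]^H . fmcw_fft[:, r]) / (B[:, t]^H . B[:, t])
num = np.conj(B).T @ fmcw_fft[:, :r.size]  # (tsize, rsize)
den = np.sum(np.abs(B)**2, axis=0)         # (tsize,)
beta_ref = (num / den[:, None]).T          # (rsize, tsize)
assert np.allclose(beta, beta_ref)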
RZero = np.zeros((Mr*Mt, Mr*Mt), dtype=complex)
beta = beta.astype(np.complex64)
BETA2 = (np.square(np.abs(beta)) + 1j*0).astype(np.complex64)
Bcon = np.conj(B)
MATRIX_SIZE_O = Mt * Mr  # 35
MATRIX_SIZE_I = Mt * Mr  # 35
BLOCK_SIZE = 1           # one thread per block
GRID_X = MATRIX_SIZE_O
GRID_Y = MATRIX_SIZE_O
# quantize real/imag parts to half precision, but store as complex64 for the kernels
B = (np.real(B).astype(np.float16) + 1j*np.imag(B).astype(np.float16)).astype(np.complex64)
a_cpu = B
Bcon = (np.real(Bcon).astype(np.float16) + 1j*np.imag(Bcon).astype(np.float16)).astype(np.complex64)
b_cpu = Bcon
FFT_cpu = (np.real(fmcw_fft).astype(np.float16) + 1j*np.imag(fmcw_fft).astype(np.float16)).astype(np.complex64)
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.to_gpu(BETA2)
c_gpu = gpuarray.zeros((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)  # OuterKernel accumulates into C, so start from zero
FFT_gpu = gpuarray.to_gpu(FFT_cpu)
kernel_code = kernel_code_template % {
'MATRIX_SIZE_I': MATRIX_SIZE_I,
'MATRIX_SIZE_O': MATRIX_SIZE_O,
'BLOCK_SIZE': BLOCK_SIZE,
'tSize': tsize,
'rSize': rsize,
}
mod = compiler.SourceModule(kernel_code)
outer = mod.get_function("OuterKernel")
t1 = time.time()
outer(
a_gpu, b_gpu, BETA2_gpu,
c_gpu,
grid = (GRID_X, GRID_Y),
block = (BLOCK_SIZE, BLOCK_SIZE, 1),
)
driver.Context.synchronize()  # launches are asynchronous; wait before stopping the clock
t2 = time.time()
t_gpu = t2-t1
print('INFORMATION:', driver.mem_get_info())
print('t_gpu1 = ', t_gpu)
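The Context.synchronize() above makes the time.time() pair meaningful; CUDA events are the alternative that measures just the kernel. The pattern with PyCUDA's event API:

# bracket a launch with events instead of host clocks
start_evt, end_evt = driver.Event(), driver.Event()
start_evt.record()
# ... kernel launch goes here ...
end_evt.record()
end_evt.synchronize()
print('kernel time (ms):', start_evt.time_till(end_evt))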
aa_gpu = gpuarray.to_gpu(a_cpu)
bb_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.empty((rsize, tsize), np.complex64)  # DotKernel writes BETA here
rinv_gpu = gpuarray.to_gpu(c_gpu.get())  # copy of OuterKernel's output
cc_gpu = gpuarray.zeros((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)  # CC is indexed by c = wB*by + bx, so size it for the full grid
dot = mod.get_function("DotKernel")  # reuse the module compiled above
t1 = time.time()
dot(
aa_gpu,
bb_gpu,
BETA2_gpu,
rinv_gpu,
FFT_gpu,
cc_gpu,
grid = (GRID_X, GRID_Y),
block = (BLOCK_SIZE, BLOCK_SIZE, 1),)
driver.Context.synchronize()  # wait for DotKernel to finish
t2 = time.time()
t_gpu = t2-t1
print('t_gpu2 = ', t_gpu)
beta = BETA2_gpu.get()
toc = time.time()
print(toc-tic)
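Since the symptom is that nothing seems to get written, two quick checks against the CPU reference from above tell whether DotKernel stored anything at all (loose tolerances, because of the float16 round-trip of the inputs):

print('DotKernel wrote anything:', np.any(beta != 0))
print('close to CPU reference:', np.allclose(beta, beta_ref, rtol=1e-2, atol=1e-4))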
beta[1:12, :] = 0  # blank the nearest range bins before plotting
plt.contourf(np.outer(r, np.sin(theta)), np.outer(r, np.cos(theta)), 20*np.log10(np.abs(beta) + 1e-12))  # small floor avoids log10(0)
plt.show()