Hey everyone,
I’m still getting the hang of CUDA, and I am currently writing my first proper program in it, but when I have made the kernel, I try to compile it and get the following two errors:
Error 1 error LNK2019: unresolved external symbol “void __cdecl kernel(float,float,float,float,float,float)” (?kernel@@YAXMMMMMM@Z) referenced in function main main.cu.obj GPMAD Cuda Section
Error 2 fatal error LNK1120: 1 unresolved externals C:\Users\hamnett\Desktop\GPMAD Cuda Section\x64\Debug\GPMAD Cuda Section.exe GPMAD Cuda Section
The kernel is shown as a prototype at the top of the code as such:
__global__ void kernel(float x, float p_x, float y, float p_y, float t, float p_t);
and is mentioned in main() as this:
unsigned int num_threads = 256; //The number of threads per block
unsigned int num_blocks = no_particles/num_threads; //The number of blocks
dim3 grid( num_blocks, 1, 1);
dim3 threads( num_threads, 1, 1);
//Execute kernel
kernel<<<grid,threads>>>(*device_particle_x, *device_particle_p_x, *device_particle_y, *device_particle_p_y, *device_particle_t, *device_particle_p_t);
and finally, the actual kernel code is this:
//This is the kernel
__global__ void kernel(float* x, float* p_x, float* y, float* p_y, float* t, float* p_t) {
const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Define the transport matrix R
float R[6][6] = {0}; //initialise the transport matrix R to all zero values
float Beta = 0.8; //Assign a value to relativistic Beta (Obviously this will be determined by program at a later stage)
float Gamma = 1.666; //As above for relativistic Gamma
float L = 100.0; //Length of the drift space
//To start with, we need to change the value of R to match a purpose, as at the moment it is set to all zeros
//For now, we will make it equal to the drift matrix
for(int j = 0; j < 5; ++j){ //Remember that the first matrix element begins at zero so the last is the 5th element
R[j][j] = 1.0; //Sets the diagonals to 1
}
R[1][0] = R[3][2] = L;
R[5][4] = L/(Beta*Beta*Gamma*Gamma);
//Now we need to create temporary variable arrays to store the results before returning them to the host
float X, P_X, Y, P_Y, T, P_T;
//Perform Matrix Multiplication of R matrix with particle elements for this thread index
X = R[0][0]*x[tid] + R[1][0]*p_x[tid] + R[2][0]*y[tid] + R[3][0]*p_y[tid] + R[4][0]*t[tid] + R[5][0]*p_t[tid];
P_X = R[0][1]*x[tid] + R[1][1]*p_x[tid] + R[2][1]*y[tid] + R[3][1]*p_y[tid] + R[4][1]*t[tid] + R[5][1]*p_t[tid];
Y = R[0][2]*x[tid] + R[1][2]*p_x[tid] + R[2][2]*y[tid] + R[3][2]*p_y[tid] + R[4][2]*t[tid] + R[5][2]*p_t[tid];
P_Y = R[0][3]*x[tid] + R[1][3]*p_x[tid] + R[2][3]*y[tid] + R[3][3]*p_y[tid] + R[4][3]*t[tid] + R[5][3]*p_t[tid];
T = R[0][4]*x[tid] + R[1][4]*p_x[tid] + R[2][4]*y[tid] + R[3][4]*p_y[tid] + R[4][4]*t[tid] + R[5][4]*p_t[tid];
P_T = R[0][5]*x[tid] + R[1][5]*p_x[tid] + R[2][5]*y[tid] + R[3][5]*p_y[tid] + R[4][5]*t[tid] + R[5][5]*p_t[tid];
//Now bring the values from our temporary variables back into the ones used in the original kernel
x[tid] = X;
p_x[tid] = P_X;
y[tid] = Y;
p_y[tid] = P_Y;
t[tid] = T;
p_t[tid] = P_T;
}
Any suggestions would be greatly appreciated, many thanks
Forgot to mention, also I used CUDA_VS_Wizard_W64.2.2.beta2 by kyzhao and friends to build projects