Hi.
I am working on a research project and I'm trying to port our CPU code to CUDA-capable parallel code.
I'm using the CUDA.NET wrapper for the host part in C#.
At first I convert a 2d array to 1d and send it to gpu.
Then I launch the kernel.
After that I transfer the results back to host memory.
If you look at my host code, the line that is numbered is where I face the problem. I have two scenarios:
1- with my GTX 285 I compile the cuda 3.2 code for sm_13 architecture
I get no error messages, but my host-memory variable "tempEM_UNITmean_av" comes back as all zeros — I mean every element of the array has the value 0.
2- with my GTX 570 I compile the same cuda 3.2 code for sm_20 architecture and I receive an exception saying “CUDA Unknown error”.
here’s my HOST code:
// __global__ void kmean
// (int frameCount
// ,int M
// , double* MFCC_UNIT_parm_t_by_l
// , double* EM_UNIT_mean_k_by_l
// ,double* EM_UNIT_label_t
// ,double* EM_UNIT_mean_av_M_by_l)
// Flatten the jagged 2-D host arrays into 1-D buffers, copy them to the GPU,
// pack the kernel parameters by hand (driver-API style), launch, and copy the
// results back. C# `new double[]` zero-initializes, so tempEM_UNITmean_av
// starts as all zeros on both host and device.
double[] tempMfccUnitParm = new double[MFCC_UNIT.parm.Length * MFCC_UNIT.parm[0].Length];
double[] tempEM_UNITmean = new double[EM_UNIT.mean.Length * EM_UNIT.mean[0].Length];
double[] tempEM_UNITmean_av = new double[EM_UNIT.mean_av.Length * EM_UNIT.mean_av[0].Length];
#region 2d array to 1d
// Row-major flatten: element [x][y] lands at x * columns + y, matching the
// i*PARM_SIZE+l indexing used inside the kernel.
int rows = MFCC_UNIT.frame_count;
int columns = MFCC_UNIT.parm[0].Length;
for (int x = 0; x < rows; x++)
for (int y = 0; y < columns; y++)
{
tempMfccUnitParm[x * columns + y] = MFCC_UNIT.parm[x][y];
}
rows = EM_UNIT.mean.Length;
columns = EM_UNIT.mean[0].Length;
for (int x = 0; x < rows; x++)
for (int y = 0; y < columns; y++)
{
tempEM_UNITmean[x * columns + y] = EM_UNIT.mean[x][y];
}
rows = EM_UNIT.mean_av.Length;
columns = EM_UNIT.mean_av[0].Length;
for (int x = 0; x < rows; x++)
for (int y = 0; y < columns; y++)
{
tempEM_UNITmean_av[x * columns + y] = EM_UNIT.mean_av[x][y];
}
#endregion
// Host -> device copies; each call allocates a device buffer and returns its pointer.
CUdeviceptr MFCCunit_ParmONDEVICE = cudaKernel.CopyHostToDevice<double>(tempMfccUnitParm);
CUdeviceptr EMunit_meanONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean);
CUdeviceptr EMunit_labelONDEVICE = cudaKernel.CopyHostToDevice<double>(EM_UNIT.label);
CUdeviceptr EMunit_mean_avONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean_av);
// Manual parameter packing: two ints first, then four device pointers.
cudaKernel.SetParameter(CUDAkmean, 0, (uint)MFCC_UNIT.frame_count);
cudaKernel.SetParameter(CUDAkmean, sizeof(int), (uint)M);
// NOTE(review): the offsets below advance by IntPtr.Size (8 on a 64-bit host)
// per pointer, but each pointer is cast to uint, so only 4 bytes are actually
// written at each slot. If the process / device pointer width is 64-bit, the
// kernel reads pointers with garbage (or zero) upper halves — a plausible
// cause of both the all-zero result (scenario 1) and the "CUDA Unknown error"
// (scenario 2). Verify the pointer width CUDA.NET expects here, and pass the
// full pointer value (e.g. an overload taking the CUdeviceptr / 8-byte value)
// instead of truncating to uint — TODO confirm against the CUDA.NET API.
cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2, (uint)MFCCunit_ParmONDEVICE.Pointer);
cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size, (uint)EMunit_meanONDEVICE.Pointer);
cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 2, (uint)EMunit_labelONDEVICE.Pointer);
cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 3, (uint)EMunit_mean_avONDEVICE.Pointer);
// NOTE(review): total size only covers 3 pointer slots — the last pointer
// starts at sizeof(int)*2 + IntPtr.Size*3, so the total should be
// sizeof(int)*2 + IntPtr.Size*4. An undersized parameter block can make the
// launch fail or the kernel read a truncated last argument — verify.
cudaKernel.SetParameterSize(CUDAkmean, (uint)(sizeof(int) * 2 + IntPtr.Size * 3));
cudaKernel.SetFunctionBlockShape(CUDAkmean, BLOCKSIZEx, BLOCKSIZEy, 1);
// Grid: ceil-ish division over frames in x (1 extra block when evenly divisible).
// NOTE(review): no error check after Launch — a failed launch here would
// surface only later (e.g. as the "Unknown error" on the copy-back below).
cudaKernel.Launch(CUDAkmean, 1 + MFCC_UNIT.frame_count / BLOCKSIZEx, 1);
cudaKernel.CopyDeviceToHost(EMunit_labelONDEVICE, EM_UNIT.label);
/////////////////////////////
//here's my problem line
// 1
cudaKernel.CopyDeviceToHost(EMunit_mean_avONDEVICE, tempEM_UNITmean_av);
//
///////////////////////////
// Release all device buffers.
cudaKernel.Free(MFCCunit_ParmONDEVICE);
cudaKernel.Free(EMunit_meanONDEVICE);
cudaKernel.Free(EMunit_labelONDEVICE);
cudaKernel.Free(EMunit_mean_avONDEVICE);
Here’s my DEVICE code:
#include "math_constants.h"
#ifndef _GPUCODE_KERNEL_H_
#define _GPUCODE_KERNEL_H_
// CAS-based atomicAdd for double. Hardware double-precision atomicAdd only
// exists on SM60+; 64-bit atomicCAS is available from SM12, so this helper
// works on both sm_13 (GTX 285) and sm_20 (GTX 570).
__device__ double atomicAddDouble(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// One thread per frame (x dimension; only the j==0 row of each block does
// work). For frame i, finds the mixture mean (of M means, each PARM_SIZE
// doubles, row-major) closest in squared Euclidean distance, records its
// index in EM_UNIT_label_t[i], and accumulates the frame's parameters into
// EM_UNIT_mean_av_M_by_l for that mixture.
//
// EM_UNIT_mean_av_M_by_l must be zero-initialized by the caller before launch.
// The accumulation uses atomics: many frames can share the same nearest
// mixture, so a plain `+=` on the shared row is a data race that silently
// loses updates (nondeterministic results).
extern "C" __global__ void kmean(int frameCount,int M, double* MFCC_UNIT_parm_t_by_l, double* EM_UNIT_mean_k_by_l,double* EM_UNIT_label_t,double* EM_UNIT_mean_av_M_by_l)
{
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int PARM_SIZE = 16;   // doubles per frame / per mixture mean
    if ( i < frameCount && j==0 ) // one worker thread per frame t in [0, frameCount)
    {
        double min = 0;
        double distance;
        int min_i = 0; // index of the closest mixture so far (start with k=0)
        double temp=0;
        // Squared distance from frame i to mixture 0 seeds the minimum.
        for (int l = 0; l < PARM_SIZE; l++)
        {
            temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[0*PARM_SIZE+l];
            min += temp * temp; //k=0
        }
        // Scan the remaining mixtures; `<=` keeps the original tie-breaking
        // (the last mixture with an equal distance wins).
        for (int k = 1; k < M; k++) //Number of Mixtures
        {
            distance = 0.0;
            for (int l = 0; l < PARM_SIZE; l++)
            {
                temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[k*PARM_SIZE+l];
                distance += temp * temp;
            }
            if (distance <= min)
            {
                min = distance;
                min_i = k;
            }
        } //for k
        EM_UNIT_label_t[i] = min_i;
        // Accumulate this frame into its winning mixture's running sum.
        // Atomic: threads whose frames share the same min_i write the same row.
        for (int l = 0; l < PARM_SIZE; l++)
            atomicAddDouble(&EM_UNIT_mean_av_M_by_l[min_i*PARM_SIZE+l],
                            MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l]);
    }
}
My systems are:
1-
Pentium D dual core
2 gigs of ddr 2 ram
XFX GTX 285
Windows 7
Cuda 3.2 installed
2-
Core i7 920
3 gigs of DDR3 ram corsair xmpp 1600mhz
EVGA GTX 570
Windows 7
Cuda 3.2 installed
I cannot figure out what I'm doing wrong.
I would be very grateful if you could help me find the reason I get this strange behavior.
Thank you.
Best Regards,
Aram Azhari