CUDA Unknown error cuda.net : transferring the data back to host problem

Aram · June 8, 2011, 11:13am

Hi.

I am working on a research project and i’m trying to port our cpu code to cuda capable parallel code.

i’m using the cuda.net wrapper for the host part on c#.

At first I convert a 2d array to 1d and send it to gpu.

Then I launch the kernel.

After that I transfer the results back to host memory.

If you look at my host code, the line that is numbered is where I face the problem. I have two scenarios:

1- with my GTX 285 I compile the cuda 3.2 code for sm_13 architecture

I get no error messages, but my host memory array “tempEM_UNITmean_av” comes back all zeros, i.e. every element of the array has the value 0.

2- with my GTX 570 I compile the same cuda 3.2 code for sm_20 architecture and I receive an exception saying “CUDA Unknown error”.

here’s my HOST code:

// __global__ void kmean

	// (int frameCount

	//	,int M

	//	, double* MFCC_UNIT_parm_t_by_l

	//	, double* EM_UNIT_mean_k_by_l

	//	,double* EM_UNIT_label_t

	//	,double* EM_UNIT_mean_av_M_by_l)

	// Host side of the kmean launch: flatten the jagged arrays, upload them,
	// bind the six kernel arguments, launch, and read the results back.

	// Row-major staging buffers, one per jagged array handed to the kernel.
	double[] tempMfccUnitParm = new double[MFCC_UNIT.parm.Length * MFCC_UNIT.parm[0].Length];
	double[] tempEM_UNITmean = new double[EM_UNIT.mean.Length * EM_UNIT.mean[0].Length];
	double[] tempEM_UNITmean_av = new double[EM_UNIT.mean_av.Length * EM_UNIT.mean_av[0].Length];

	#region 2d array to 1d

	// Element [x][y] of each jagged array lands at index x * columns + y.
	int rows = MFCC_UNIT.frame_count;
	int columns = MFCC_UNIT.parm[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempMfccUnitParm[x * columns + y] = MFCC_UNIT.parm[x][y];
		}
	}

	rows = EM_UNIT.mean.Length;
	columns = EM_UNIT.mean[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempEM_UNITmean[x * columns + y] = EM_UNIT.mean[x][y];
		}
	}

	rows = EM_UNIT.mean_av.Length;
	columns = EM_UNIT.mean_av[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempEM_UNITmean_av[x * columns + y] = EM_UNIT.mean_av[x][y];
		}
	}

	#endregion

	// Upload the inputs. The label and mean_av buffers are also uploaded
	// because the kernel writes into them and they are copied back below.
	CUdeviceptr MFCCunit_ParmONDEVICE = cudaKernel.CopyHostToDevice<double>(tempMfccUnitParm);
	CUdeviceptr EMunit_meanONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean);
	CUdeviceptr EMunit_labelONDEVICE = cudaKernel.CopyHostToDevice<double>(EM_UNIT.label);
	CUdeviceptr EMunit_mean_avONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean_av);

	// Argument layout: two ints followed by four device pointers.
	// NOTE(review): the pointer offsets advance by IntPtr.Size, but each value is
	// written through a (uint) cast, i.e. 4 bytes. That is only consistent when
	// the process is 32-bit - confirm the build target or use a pointer-sized overload.
	cudaKernel.SetParameter(CUDAkmean, 0, (uint)MFCC_UNIT.frame_count);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int), (uint)M);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2, (uint)MFCCunit_ParmONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size, (uint)EMunit_meanONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 2, (uint)EMunit_labelONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 3, (uint)EMunit_mean_avONDEVICE.Pointer);

	// FIX: the total size must include all FOUR pointers. The previous value
	// (IntPtr.Size * 3) stopped at the offset where the last pointer starts, so
	// the mean_av pointer was never part of the parameter block seen by the kernel.
	cudaKernel.SetParameterSize(CUDAkmean, (uint)(sizeof(int) * 2 + IntPtr.Size * 4));

	cudaKernel.SetFunctionBlockShape(CUDAkmean, BLOCKSIZEx, BLOCKSIZEy, 1);

	// Enough blocks along x to cover every frame; the kernel bounds-checks the tail.
	cudaKernel.Launch(CUDAkmean, 1 + MFCC_UNIT.frame_count / BLOCKSIZEx, 1);

	cudaKernel.CopyDeviceToHost(EMunit_labelONDEVICE, EM_UNIT.label);

/////////////////////////////
//here's my problem line
// 1
	cudaKernel.CopyDeviceToHost(EMunit_mean_avONDEVICE, tempEM_UNITmean_av);
//
///////////////////////////

	// Release the device buffers.
	cudaKernel.Free(MFCCunit_ParmONDEVICE);
	cudaKernel.Free(EMunit_meanONDEVICE);
	cudaKernel.Free(EMunit_labelONDEVICE);
	cudaKernel.Free(EMunit_mean_avONDEVICE);

Here’s my DEVICE code:

#include "math_constants.h"

#ifndef _GPUCODE_KERNEL_H_
#define _GPUCODE_KERNEL_H_

// Atomically performs *address += value on a double and returns the previous
// value. Implemented as a compare-and-swap loop on the 64-bit pattern because
// the built-in atomicAdd(double*) only exists on SM60+.
// Requires 64-bit atomicCAS on global memory (compute capability 1.2+).
static __device__ double kmeanAtomicAddDouble(double* address, double value)
{
	unsigned long long int* slot = (unsigned long long int*)address;
	unsigned long long int observed = *slot;
	unsigned long long int assumed;

	do
	{
		assumed = observed;
		observed = atomicCAS(slot, assumed,
			__double_as_longlong(value + __longlong_as_double(assumed)));
	} while (assumed != observed); // another thread updated the cell first: retry

	return __longlong_as_double(observed);
}

// k-means assignment step: one thread per frame.
//
// Launch layout: global x index selects the frame; only threads whose global
// y index is 0 do any work. Frames with index >= frameCount are skipped.
//
// frameCount               number of frames to process
// M                        number of mixtures (centroids) compared against
// MFCC_UNIT_parm_t_by_l    frame parameters, indexed [frame * 16 + l]
// EM_UNIT_mean_k_by_l      mixture means, indexed [k * 16 + l]
// EM_UNIT_label_t          out: index of the nearest mixture per frame (stored as double)
// EM_UNIT_mean_av_M_by_l   in/out: per-mixture accumulators, indexed [k * 16 + l];
//                          each frame's parameters are added to its nearest mixture
//
// Accumulation is atomic, so no updates are lost when several frames pick the
// same mixture. The floating-point summation order is not fixed, so low-order
// bits of the sums may differ between runs.
extern "C" __global__ void kmean(int frameCount,int M, double* MFCC_UNIT_parm_t_by_l, double* EM_UNIT_mean_k_by_l,double* EM_UNIT_label_t,double* EM_UNIT_mean_av_M_by_l)
{
	const unsigned int i = blockIdx.x * blockDim.x  + threadIdx.x;
	const unsigned int j = blockIdx.y * blockDim.y  + threadIdx.y;
	const unsigned int PARM_SIZE = 16;

	if ( i < frameCount && j==0 ) // 0 to t
	{
		double min = 0;
		double distance;
		int min_i = 0; // start with mixture 0 as the best candidate
		double temp = 0;

		// Squared distance from frame i to mixture 0.
		for (unsigned int l = 0; l < PARM_SIZE; l++)
		{
			temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[0*PARM_SIZE+l];
			min += temp * temp; //k=0
		}

		// Compare against the remaining mixtures; ties go to the later index.
		for (int k = 1; k < M; k++) //Number of Mixtures
		{
			distance = 0.0;
			for (unsigned int l = 0; l < PARM_SIZE; l++)
			{
				temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[k*PARM_SIZE+l];
				distance += temp * temp;
			}

			if (distance <= min)
			{
				min = distance;
				min_i = k;
			}
		} //for k

		EM_UNIT_label_t[i] = min_i;

		// FIX: many frames can map to the same mixture, so a plain "+=" here is
		// a data race between threads. Accumulate atomically instead.
		for (unsigned int l = 0; l < PARM_SIZE; l++) //min_i ....index of minimum distance to mean
		{
			kmeanAtomicAddDouble(&EM_UNIT_mean_av_M_by_l[min_i*PARM_SIZE+l],
				MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l]);
		}
	}
}

#endif // _GPUCODE_KERNEL_H_

My systems are:

1-

Pentium D dual core

2 gigs of ddr 2 ram

XFX GTX 285

Windows 7

Cuda 3.2 installed

2-

Core i7 920

3 gigs of DDR3 ram corsair xmpp 1600mhz

EVGA GTX 570

Windows 7

Cuda 3.2 installed

I cannot figure out where I’m doing wrong.

I will be very grateful if you help me find the reason i get this strange situation.

Thank you.

Best Regards,

Aram Azhari

opensw · June 10, 2011, 2:40am

Hi.

I am working on a research project and i’m trying to port our cpu code to cuda capable parallel code.

i’m using the cuda.net wrapper for the host part on c#.

At first I convert a 2d array to 1d and send it to gpu.

Then I launch the kernel.

After that I transfer the results back to host memory.

If you look at my host code, the line that is numbered is where I face the problem. I have two scenarios:

1- with my GTX 285 I compile the cuda 3.2 code for sm_13 architecture

I get no error messages, but my host memory array “tempEM_UNITmean_av” comes back all zeros, i.e. every element of the array has the value 0.

2- with my GTX 570 I compile the same cuda 3.2 code for sm_20 architecture and I receive an exception saying “CUDA Unknown error”.

here’s my HOST code:

// __global__ void kmean

	// (int frameCount

	//	,int M

	//	, double* MFCC_UNIT_parm_t_by_l

	//	, double* EM_UNIT_mean_k_by_l

	//	,double* EM_UNIT_label_t

	//	,double* EM_UNIT_mean_av_M_by_l)

	// Host side of the kmean launch: flatten the jagged arrays, upload them,
	// bind the six kernel arguments, launch, and read the results back.

	// Row-major staging buffers, one per jagged array handed to the kernel.
	double[] tempMfccUnitParm = new double[MFCC_UNIT.parm.Length * MFCC_UNIT.parm[0].Length];
	double[] tempEM_UNITmean = new double[EM_UNIT.mean.Length * EM_UNIT.mean[0].Length];
	double[] tempEM_UNITmean_av = new double[EM_UNIT.mean_av.Length * EM_UNIT.mean_av[0].Length];

	#region 2d array to 1d

	// Element [x][y] of each jagged array lands at index x * columns + y.
	int rows = MFCC_UNIT.frame_count;
	int columns = MFCC_UNIT.parm[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempMfccUnitParm[x * columns + y] = MFCC_UNIT.parm[x][y];
		}
	}

	rows = EM_UNIT.mean.Length;
	columns = EM_UNIT.mean[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempEM_UNITmean[x * columns + y] = EM_UNIT.mean[x][y];
		}
	}

	rows = EM_UNIT.mean_av.Length;
	columns = EM_UNIT.mean_av[0].Length;
	for (int x = 0; x < rows; x++)
	{
		for (int y = 0; y < columns; y++)
		{
			tempEM_UNITmean_av[x * columns + y] = EM_UNIT.mean_av[x][y];
		}
	}

	#endregion

	// Upload the inputs. The label and mean_av buffers are also uploaded
	// because the kernel writes into them and they are copied back below.
	CUdeviceptr MFCCunit_ParmONDEVICE = cudaKernel.CopyHostToDevice<double>(tempMfccUnitParm);
	CUdeviceptr EMunit_meanONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean);
	CUdeviceptr EMunit_labelONDEVICE = cudaKernel.CopyHostToDevice<double>(EM_UNIT.label);
	CUdeviceptr EMunit_mean_avONDEVICE = cudaKernel.CopyHostToDevice<double>(tempEM_UNITmean_av);

	// Argument layout: two ints followed by four device pointers.
	// NOTE(review): the pointer offsets advance by IntPtr.Size, but each value is
	// written through a (uint) cast, i.e. 4 bytes. That is only consistent when
	// the process is 32-bit - confirm the build target or use a pointer-sized overload.
	cudaKernel.SetParameter(CUDAkmean, 0, (uint)MFCC_UNIT.frame_count);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int), (uint)M);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2, (uint)MFCCunit_ParmONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size, (uint)EMunit_meanONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 2, (uint)EMunit_labelONDEVICE.Pointer);
	cudaKernel.SetParameter(CUDAkmean, sizeof(int) * 2 + IntPtr.Size * 3, (uint)EMunit_mean_avONDEVICE.Pointer);

	// FIX: the total size must include all FOUR pointers. The previous value
	// (IntPtr.Size * 3) stopped at the offset where the last pointer starts, so
	// the mean_av pointer was never part of the parameter block seen by the kernel.
	cudaKernel.SetParameterSize(CUDAkmean, (uint)(sizeof(int) * 2 + IntPtr.Size * 4));

	cudaKernel.SetFunctionBlockShape(CUDAkmean, BLOCKSIZEx, BLOCKSIZEy, 1);

	// Enough blocks along x to cover every frame; the kernel bounds-checks the tail.
	cudaKernel.Launch(CUDAkmean, 1 + MFCC_UNIT.frame_count / BLOCKSIZEx, 1);

	cudaKernel.CopyDeviceToHost(EMunit_labelONDEVICE, EM_UNIT.label);

/////////////////////////////
//here's my problem line
// 1
	cudaKernel.CopyDeviceToHost(EMunit_mean_avONDEVICE, tempEM_UNITmean_av);
//
///////////////////////////

	// Release the device buffers.
	cudaKernel.Free(MFCCunit_ParmONDEVICE);
	cudaKernel.Free(EMunit_meanONDEVICE);
	cudaKernel.Free(EMunit_labelONDEVICE);
	cudaKernel.Free(EMunit_mean_avONDEVICE);

Here’s my DEVICE code:

#include "math_constants.h"

#ifndef _GPUCODE_KERNEL_H_
#define _GPUCODE_KERNEL_H_

// Atomically performs *address += value on a double and returns the previous
// value. Implemented as a compare-and-swap loop on the 64-bit pattern because
// the built-in atomicAdd(double*) only exists on SM60+.
// Requires 64-bit atomicCAS on global memory (compute capability 1.2+).
static __device__ double kmeanAtomicAddDouble(double* address, double value)
{
	unsigned long long int* slot = (unsigned long long int*)address;
	unsigned long long int observed = *slot;
	unsigned long long int assumed;

	do
	{
		assumed = observed;
		observed = atomicCAS(slot, assumed,
			__double_as_longlong(value + __longlong_as_double(assumed)));
	} while (assumed != observed); // another thread updated the cell first: retry

	return __longlong_as_double(observed);
}

// k-means assignment step: one thread per frame.
//
// Launch layout: global x index selects the frame; only threads whose global
// y index is 0 do any work. Frames with index >= frameCount are skipped.
//
// frameCount               number of frames to process
// M                        number of mixtures (centroids) compared against
// MFCC_UNIT_parm_t_by_l    frame parameters, indexed [frame * 16 + l]
// EM_UNIT_mean_k_by_l      mixture means, indexed [k * 16 + l]
// EM_UNIT_label_t          out: index of the nearest mixture per frame (stored as double)
// EM_UNIT_mean_av_M_by_l   in/out: per-mixture accumulators, indexed [k * 16 + l];
//                          each frame's parameters are added to its nearest mixture
//
// Accumulation is atomic, so no updates are lost when several frames pick the
// same mixture. The floating-point summation order is not fixed, so low-order
// bits of the sums may differ between runs.
extern "C" __global__ void kmean(int frameCount,int M, double* MFCC_UNIT_parm_t_by_l, double* EM_UNIT_mean_k_by_l,double* EM_UNIT_label_t,double* EM_UNIT_mean_av_M_by_l)
{
	const unsigned int i = blockIdx.x * blockDim.x  + threadIdx.x;
	const unsigned int j = blockIdx.y * blockDim.y  + threadIdx.y;
	const unsigned int PARM_SIZE = 16;

	if ( i < frameCount && j==0 ) // 0 to t
	{
		double min = 0;
		double distance;
		int min_i = 0; // start with mixture 0 as the best candidate
		double temp = 0;

		// Squared distance from frame i to mixture 0.
		for (unsigned int l = 0; l < PARM_SIZE; l++)
		{
			temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[0*PARM_SIZE+l];
			min += temp * temp; //k=0
		}

		// Compare against the remaining mixtures; ties go to the later index.
		for (int k = 1; k < M; k++) //Number of Mixtures
		{
			distance = 0.0;
			for (unsigned int l = 0; l < PARM_SIZE; l++)
			{
				temp = MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l] - EM_UNIT_mean_k_by_l[k*PARM_SIZE+l];
				distance += temp * temp;
			}

			if (distance <= min)
			{
				min = distance;
				min_i = k;
			}
		} //for k

		EM_UNIT_label_t[i] = min_i;

		// FIX: many frames can map to the same mixture, so a plain "+=" here is
		// a data race between threads. Accumulate atomically instead.
		for (unsigned int l = 0; l < PARM_SIZE; l++) //min_i ....index of minimum distance to mean
		{
			kmeanAtomicAddDouble(&EM_UNIT_mean_av_M_by_l[min_i*PARM_SIZE+l],
				MFCC_UNIT_parm_t_by_l[i*PARM_SIZE+l]);
		}
	}
}

#endif // _GPUCODE_KERNEL_H_

My systems are:

1-

Pentium D dual core

2 gigs of ddr 2 ram

XFX GTX 285

Windows 7

Cuda 3.2 installed

2-

Core i7 920

3 gigs of DDR3 ram corsair xmpp 1600mhz

EVGA GTX 570

Windows 7

Cuda 3.2 installed

I cannot figure out where I’m doing wrong.

I will be very grateful if you help me find the reason i get this strange situation.

Thank you.

Best Regards,

Aram Azhari

Hello there,

Sorry, but I left the CUDA.NET wrapper (Hoopoe) a long time ago, so I do not remember well how it works… I do not know what you are porting from CPU to GPU, but if you are using .NET on the host and I can give you some advice, try this: Our Products - Hybrid DSP. It is much better than the CUDA.NET of the Hoopoe project ;-)… You can develop everything in C# (the CUDA kernel too!!). Read this: Using Cudafy for GPGPU Programming in .NET - CodeProject

Cheers

Aram · June 10, 2011, 7:00am

Thank you. I already found that out just yesterday; the good thing about CUDAfy.NET is that I can debug in emulator mode without putting my GPUs in the same PC.

Thanks anyways.