Batch SOM Parallel Algorithm for Matlab — Source: Matlab, MEX, CUDA C

Useful if you need to train a large SOM, have a lot of training data, or your problem is high-dimensional.

gpu_batchsom.cu

[font="Arial"]//
// Parallel Batch Train SOM
// Parallel NumSOM
// Parallel SOMSize
// Parallel DataLen
// Parallel Dim
//
#include <stdlib.h>
#include <float.h>
#include <minmax.h>

#include "mex.h"
#include "matrix.h"

#include "cuda.h"
#include "cuda_runtime_api.h"
#include "cublas.h"

// One best-matching-unit record: distance plus neuron/data indices.
// NOTE(review): declared but not referenced by the visible code — confirm callers.
struct Bmus
{
float err;      // distance from the data vector to the neuron
int idNeuron;   // index of the winning neuron
int idData;     // index of the data vector
};
// Geometry and launch parameters, shared between host and kernels
// (copied to the device once as DevParam).
struct SParam
{
int DataLen;    // number of data vectors (mxGetM of the data matrix)
int NumSOM;     // number of SOMs trained in parallel (one block per SOM)
int Dim;        // dimensionality of data / codebook vectors
int SizeMSOM;   // SOM grid size, first dimension
int SizeNSOM;   // SOM grid size, second dimension
int SizeSOM;    // neurons per SOM = SizeMSOM * SizeNSOM

int NumThreadData;  // threads along the data axis (threadIdx.y)
int NumThreadDim;   // threads along the dimension axis (threadIdx.z)

int SizeBlockDim;   // dims handled per z-thread = Dim / NumThreadDim
int ModBlockDim;    // NOTE(review): never assigned in the visible code

int SizeBlockData;  // data vectors handled per y-thread = DataLen / NumThreadData
int ModBlockData;   // NOTE(review): never assigned in the visible code
};

// Pointers to device buffers. They are allocated with cudaMalloc from host
// code and passed to kernels as arguments, so they must be plain host
// globals — a __device__ qualifier here would be incorrect (cudaMalloc from
// the host cannot legally write into a __device__ symbol).
float *DeviceCodeBooks;        // codebooks: NumSOM x Dim x SizeSOM
float *DeviceData;             // training data: Dim x DataLen
float *DeviceAllBmus;          // neuron-to-data distances: DataLen x NumSOM x SizeSOM
int *DeviceBmus;               // 1-based BMU index per (data vector, SOM)
float *DeviceQErr;             // mean quantization error per SOM
float *DeviceUDist;            // inter-unit map distances: SizeSOM x SizeSOM
float *DeviceTmpSum;           // scratch: per-neuron data sums (serial update kernel)
float *DeviceTmpSum2;          // scratch: per-neuron hit counts (serial update kernel)
float *DeviceTmpSumDim;        // scratch for CalulateAllNeuronsDimDataParallel
float *DeviceTmpSumGetBMUDim;  // scratch for GetBMUDataParallel
float *DeviceTmpSumPr1Update1; // scratch sums for data-parallel update kernels
float *DeviceTmpSumPr1Update2; // scratch counts for data-parallel update kernels
int *DeviceSizePerDataBlocks;  // NOTE(review): declared but unused in visible code
int *DeviceSizePerDimBlocks;   // NOTE(review): declared but unused in visible code

SParam *DevParam;              // device copy of the parameter struct

// Host-side buffers (mxCalloc'd inside mexFunction).
float *HostCodeBooks;
float *HostAllBmus;
int *HostBmus;
float *HostQErr;
float *HostUDist;
float *HostTmpSum;
int *HostSizePerDataBlocks;
int *HostSizePerDimBlocks;

SParam *Param;                 // host copy of the parameter struct

// Euclidean distance from every neuron to every data vector, one SOM per block.
// Launch: grid.x = NumSOM, block.x = SizeSOM (one thread per neuron).
// Fixes vs. original: __global__ qualifier restored; Param is now passed as a
// pointer (the original declared it by value yet dereferenced it with '->',
// and the call site passes DevParam, a device pointer).
__global__ void CalulateAllNeurons(float *CodeBooks, float *Data, float *allbmus, SParam *Param)
{
    const int iSom = blockIdx.x;      // index of the SOM
    const int iNeuron = threadIdx.x;  // index of the neuron

    // Each thread walks the full data set serially.
    for (int i = 0; i < Param->DataLen; i++)
    {
        float SUM = 0.0f;
        float d = 0.0f;
        for (int k = 0; k < Param->Dim; k++)
        {
            d = CodeBooks[(iSom*Param->Dim + k)*Param->SizeSOM + iNeuron]
              - Data[k*Param->DataLen + i];
            SUM = SUM + d*d;
        }
        allbmus[(i*Param->NumSOM + iSom)*Param->SizeSOM + iNeuron] = sqrtf(SUM);
    }
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Same distance computation, additionally parallel over the data axis.
// Launch: grid.x = NumSOM, block = (SizeSOM, NumThreadData).
// Each y-thread handles a contiguous slab of SizeBlockData data vectors.
__global__ void CalulateAllNeuronsDataParallel(float *CodeBooks, float *Data, float *allbmus, SParam *Param)
{
    const int iSom = blockIdx.x;      // index of the SOM
    const int iNeuron = threadIdx.x;  // index of the neuron
    const int iData = threadIdx.y;    // index of the data slab

    int dataindex;

    // Parallel computation over all data vectors.
    for (int i = 0; i < Param->SizeBlockData; i++)
    {
        dataindex = Param->SizeBlockData*iData + i;
        float SUM = 0.0f;
        float d = 0.0f;

        for (int k = 0; k < Param->Dim; k++)
        {
            d = CodeBooks[(iSom*Param->Dim + k)*Param->SizeSOM + iNeuron]
              - Data[k*Param->DataLen + dataindex];
            SUM = SUM + d*d;
        }
        allbmus[(dataindex*Param->NumSOM + iSom)*Param->SizeSOM + iNeuron] = sqrtf(SUM);
    }
    __syncthreads();
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Distance computation parallel over data AND dimensions.
// Launch: grid.x = NumSOM, block = (SizeSOM, NumThreadData, NumThreadDim).
// Each z-thread accumulates a partial sum over its SizeBlockDim dims into
// Stmp; the z==0 thread then folds the partials and writes the distance.
// Fix vs. original: a second __syncthreads() is added at the end of the loop
// body so the next iteration cannot overwrite Stmp while z==0 threads are
// still reading it (read/write race across iterations).
__global__ void CalulateAllNeuronsDimDataParallel(float *CodeBooks, float *Data, float *allbmus, float *Stmp, SParam *Param)
{
    const int iSom = blockIdx.x;
    const int iNeuron = threadIdx.x;
    const int iData = threadIdx.y;
    const int iDim = threadIdx.z;
    int dataindex;
    int dimindex;

    // Parallel computation over all data vectors.
    for (int i = 0; i < Param->SizeBlockData; i++)
    {
        dataindex = Param->SizeBlockData*iData + i;
        float SubSUM = 0.0f;
        float d = 0.0f;
        // Parallel computation over the data dimensions.
        for (int k = 0; k < Param->SizeBlockDim; k++)
        {
            dimindex = Param->SizeBlockDim*iDim + k;
            d = CodeBooks[(iSom*Param->Dim + dimindex)*Param->SizeSOM + iNeuron]
              - Data[dimindex*Param->DataLen + dataindex];
            SubSUM = SubSUM + d*d;
        }
        Stmp[(iSom*Param->NumThreadDim + iDim)*Param->SizeSOM*Param->NumThreadData
             + iNeuron*Param->NumThreadData + iData] = SubSUM;
        __syncthreads();  // all partial sums written (uniform trip count: safe in-loop barrier)

        if (iDim == 0)    // one z-thread folds the partial sums
        {
            float SUM = 0.0f;
            for (int k = 0; k < Param->NumThreadDim; k++)
            {
                SUM = SUM + Stmp[(iSom*Param->NumThreadDim + k)*Param->SizeSOM*Param->NumThreadData
                                 + iNeuron*Param->NumThreadData + iData];
            }
            allbmus[(dataindex*Param->NumSOM + iSom)*Param->SizeSOM + iNeuron] = sqrtf(SUM);
        }
        __syncthreads();  // folds finished before Stmp is rewritten next iteration
    }
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Serial BMU search: thread 0 of each block scans all data vectors, records
// the 1-based index of the closest neuron in bmus, and writes the mean
// quantization error of its SOM to QErr.
// Launch: grid.x = NumSOM, block.x = SizeSOM (only thread 0 does work).
__global__ void GetBMU(float *CodeBooks, float *allbmus, int *bmus, float *QErr, SParam *Param)
{
    float Min;
    float tmpS;
    const int iNeuron = threadIdx.x;
    const int iSom = blockIdx.x;
    if (iNeuron == 0)
    {
        tmpS = 0;

        for (int i = 0; i < Param->DataLen; i++)
        {
            Min = 3e+8f;  // sentinel larger than any expected distance
            for (int k = 0; k < Param->SizeSOM; k++)
            {
                float d = allbmus[(i*Param->NumSOM + iSom)*Param->SizeSOM + k];
                if (d <= Min) { Min = d; bmus[i*Param->NumSOM + iSom] = k + 1; }  // 1-based (MATLAB style)
            }
            tmpS = tmpS + Min;
        }
        QErr[iSom] = tmpS/Param->DataLen;
    }
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// BMU search parallel over the data axis: each y-thread scans its slab of
// SizeBlockData vectors, writes its partial mean error into TmpSum3, and
// y==0 averages the partials into QErr.
// Launch: grid.x = NumSOM, block = (SizeSOM, NumThreadData).
// NOTE(review): all x-threads redo the same scan redundantly; iNeuron is
// computed but unused — kept as in the original.
__global__ void GetBMUDataParallel(float *CodeBooks, float *allbmus, int *bmus, float *QErr, float *TmpSum3, SParam *Param)
{
    float Min;
    float tmpS;
    const int iSom = blockIdx.x;
    const int iNeuron = threadIdx.x;
    const int iData = threadIdx.y;

    tmpS = 0;

    for (int i = 0; i < Param->SizeBlockData; i++)  // parallel over data
    {
        Min = 3e+8f;
        for (int k = 0; k < Param->SizeSOM; k++)
        {
            float d = allbmus[((iData*Param->SizeBlockData + i)*Param->NumSOM + iSom)*Param->SizeSOM + k];
            if (d <= Min) { Min = d; bmus[(iData*Param->SizeBlockData + i)*Param->NumSOM + iSom] = k + 1; }
        }
        tmpS = tmpS + Min;
    }
    TmpSum3[iData*Param->NumSOM + iSom] = tmpS/Param->SizeBlockData;
    __syncthreads();  // wait for all partial means

    if (iData == 0)
    {
        tmpS = 0;
        for (int i = 0; i < Param->NumThreadData; i++)
            tmpS = tmpS + TmpSum3[i*Param->NumSOM + iSom];
        QErr[iSom] = tmpS/Param->NumThreadData;
    }
}
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// Gaussian neighbourhood weight between map units x and y.
// Dists holds inter-unit distances (the calling script passes them squared);
// radius is the (squared) neighbourhood radius for the current epoch.
__device__ float funh(float radius, float *Dists, int x, int y, SParam *Param)
{
    return expf(-Dists[Param->SizeSOM*x + y]/(2*radius));
}
// Batch SOM codebook update, serial over data.
// Launch: grid.x = NumSOM, block.x = SizeSOM (one thread per neuron).
// Phase 1: each neuron-thread accumulates the sum of data vectors mapped to
// it (TmpSum) and the hit count (TmpSum2). Phase 2 computes the
// neighbourhood-weighted mean over all neurons.
// Fix vs. original: a __syncthreads() is added between the phases — phase 2
// reads every neuron's accumulator, so all threads must have finished
// phase 1 first.
__global__ void UpdateNurons(float *CodeBooks, float *Data, int *bmus, float *QErr, float *UDist, float *TmpSum, float *TmpSum2, SParam *Param, float radius)
{
    const int iNeuron = threadIdx.x;
    const int iSom = blockIdx.x;
    float sumz = 0;
    float sumc = 0;

    for (int i = 0; i < Param->DataLen; i++)
    {
        if (i == 0) {
            // Clear this neuron's accumulators on the first pass.
            for (int k = 0; k < Param->Dim; k++)
            {
                int index = (iSom*Param->Dim + k)*Param->SizeSOM + iNeuron;
                TmpSum[index] = 0;
                TmpSum2[index] = 0;
            }
        }
        if (iNeuron == bmus[i*Param->NumSOM + iSom] - 1)  // bmus is 1-based
        {
            for (int k = 0; k < Param->Dim; k++)
            {
                int index = (iSom*Param->Dim + k)*Param->SizeSOM + iNeuron;
                TmpSum[index] = TmpSum[index] + Data[k*Param->DataLen + i];
                TmpSum2[index]++;
            }
        }
    }
    __syncthreads();  // all accumulators complete before cross-neuron reads

    // New codebook value = neighbourhood-weighted mean of the accumulated sums.
    for (int i = 0; i < Param->Dim; i++)
    {
        sumc = 0;
        sumz = 0;
        for (int k = 0; k < Param->SizeSOM; k++)
        {
            int index = (iSom*Param->Dim + i)*Param->SizeSOM + k;
            sumc = sumc + funh(radius, UDist, iNeuron, k, Param)*TmpSum[index];
            sumz = sumz + funh(radius, UDist, iNeuron, k, Param)*TmpSum2[index];
        }
        if (sumz > 0) CodeBooks[(iSom*Param->Dim + i)*Param->SizeSOM + iNeuron] = sumc/sumz;
    }
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Batch SOM codebook update, parallel over the data axis.
// Launch: grid.x = NumSOM, block = (SizeSOM, NumThreadData).
// TmpSum/TmpSum2 hold one accumulator slice per y-thread; after a barrier,
// y==0 threads fold the slices into slice 0 and apply the weighted update.
__global__ void UpdateNuronsDataParallel(float *CodeBooks, float *Data, int *bmus, float *QErr, float *UDist, float *TmpSum, float *TmpSum2, SParam *Param, float radius)
{
    const int iSom = blockIdx.x;
    const int iNeuron = threadIdx.x;
    const int iData = threadIdx.y;

    float sumz = 0;
    float sumc = 0;

    // Phase 1: per-(neuron, data-slab) partial sums and hit counts.
    for (int i = 0; i < Param->SizeBlockData; i++)
    {
        if (i == 0)
        {
            for (int k = 0; k < Param->Dim; k++)
            {
                int index = (iSom*Param->Dim + k)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + iNeuron;
                TmpSum[index] = 0;
                TmpSum2[index] = 0;
            }
        }

        if (iNeuron == bmus[(iData*Param->SizeBlockData + i)*Param->NumSOM + iSom] - 1)
        {
            for (int k = 0; k < Param->Dim; k++)
            {
                int index = (iSom*Param->Dim + k)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + iNeuron;
                int dataindex = k*Param->DataLen + Param->SizeBlockData*iData + i;

                TmpSum[index] = TmpSum[index] + Data[dataindex];
                TmpSum2[index]++;
            }
        }
    }
    __syncthreads();  // all partial accumulators complete

    if (iData == 0)
    {
        // Phase 2: fold per-iData partials into slice 0 (iData == 0 here).
        for (int k = 0; k < Param->Dim; k++)
        {
            for (int i = 1; i < Param->NumThreadData; i++)
            {
                int indexsave     = (iSom*Param->Dim + k)*Param->NumThreadData*Param->SizeSOM + Param->SizeSOM*iData + iNeuron;
                int indexwhatsave = (iSom*Param->Dim + k)*Param->NumThreadData*Param->SizeSOM + Param->SizeSOM*i + iNeuron;
                TmpSum[indexsave]  = TmpSum[indexsave]  + TmpSum[indexwhatsave];
                TmpSum2[indexsave] = TmpSum2[indexsave] + TmpSum2[indexwhatsave];
            }
        }

        // Phase 3: neighbourhood-weighted mean -> new codebook value.
        for (int i = 0; i < Param->Dim; i++)
        {
            sumc = 0;
            sumz = 0;
            for (int k = 0; k < Param->SizeSOM; k++)
            {
                int index = (iSom*Param->Dim + i)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + k;
                sumc = sumc + funh(radius, UDist, iNeuron, k, Param)*TmpSum[index];
                sumz = sumz + funh(radius, UDist, iNeuron, k, Param)*TmpSum2[index];
            }
            if (sumz > 0) CodeBooks[(iSom*Param->Dim + i)*Param->SizeSOM + iNeuron] = sumc/sumz;
        }
    }
}

///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Batch SOM codebook update, parallel over data AND dimensions.
// Launch: grid.x = NumSOM, block = (SizeSOM, NumThreadData, NumThreadDim).
// Fix vs. original: the mid-kernel __syncthreads() was inside the divergent
// `if (iData == 0)` branch, which is undefined behaviour; the branch is now
// split so the barrier is reached by the whole block.
__global__ void UpdateNuronsDimDataParallel(float *CodeBooks, float *Data, int *bmus, float *QErr, float *UDist, float *TmpSum, float *TmpSum2, SParam *Param, float radius)
{
    const int iSom = blockIdx.x;
    const int iNeuron = threadIdx.x;
    const int iData = threadIdx.y;
    const int iDim = threadIdx.z;

    float sumz = 0;
    float sumc = 0;

    // Phase 1: each (iData, iDim) slice accumulates partial sums / hit counts.
    for (int i = 0; i < Param->SizeBlockData; i++)
    {
        if (i == 0)
        {
            for (int k = 0; k < Param->SizeBlockDim; k++)
            {
                int index = (iSom*Param->Dim + iDim*Param->SizeBlockDim + k)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + iNeuron;
                TmpSum[index] = 0;
                TmpSum2[index] = 0;
            }
        }

        if (iNeuron == bmus[(iData*Param->SizeBlockData + i)*Param->NumSOM + iSom] - 1)
        {
            for (int k = 0; k < Param->SizeBlockDim; k++)
            {
                int index = (iSom*Param->Dim + iDim*Param->SizeBlockDim + k)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + iNeuron;
                int dataindex = (iDim*Param->SizeBlockDim + k)*Param->DataLen + Param->SizeBlockData*iData + i;

                TmpSum[index] = TmpSum[index] + Data[dataindex];
                TmpSum2[index]++;
            }
        }
    }
    __syncthreads();  // all partial accumulators complete

    // Phase 2: iData==0 threads fold the per-iData partials into slice 0.
    if (iData == 0)
    {
        for (int k = 0; k < Param->SizeBlockDim; k++)
        {
            for (int i = 1; i < Param->NumThreadData; i++)
            {
                int indexsave     = (iSom*Param->Dim + iDim*Param->SizeBlockDim + k)*Param->NumThreadData*Param->SizeSOM + Param->SizeSOM*iData + iNeuron;
                int indexwhatsave = (iSom*Param->Dim + iDim*Param->SizeBlockDim + k)*Param->NumThreadData*Param->SizeSOM + Param->SizeSOM*i + iNeuron;
                TmpSum[indexsave]  = TmpSum[indexsave]  + TmpSum[indexwhatsave];
                TmpSum2[indexsave] = TmpSum2[indexsave] + TmpSum2[indexwhatsave];
            }
        }
    }
    __syncthreads();  // barrier hoisted out of the divergent branch

    // Phase 3: neighbourhood-weighted update of this neuron's dim slice.
    if (iData == 0)
    {
        for (int i = 0; i < Param->SizeBlockDim; i++)
        {
            sumc = 0;
            sumz = 0;
            for (int k = 0; k < Param->SizeSOM; k++)
            {
                int index = (iSom*Param->Dim + iDim*Param->SizeBlockDim + i)*Param->NumThreadData*Param->SizeSOM
                          + Param->SizeSOM*iData + k;
                sumc = sumc + funh(radius, UDist, iNeuron, k, Param)*TmpSum[index];
                sumz = sumz + funh(radius, UDist, iNeuron, k, Param)*TmpSum2[index];
            }
            if (sumz > 0) CodeBooks[(iSom*Param->Dim + iDim*Param->SizeBlockDim + i)*Param->SizeSOM + iNeuron] = sumc/sumz;
        }
    }
}
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// MEX entry point: trains NumSOM batch SOMs in parallel on the GPU.
// prhs[0] : initial codebooks, single [SizeSOM x NumSOM*Dim]
// prhs[1] : int32 [NumSOM SizeMSOM SizeNSOM]
// prhs[2] : data, single [DataLen x Dim]
// prhs[3] : squared map-unit distances, single [SizeSOM x SizeSOM]
// prhs[4] : single [StartRadius EndRadius]
// prhs[5] : int32 trainlen (number of epochs)
// prhs[6] : int32 [NumThreadData NumThreadDim]
// plhs[0] : trained codebooks, single [SizeSOM x NumSOM*Dim]
// Fixes vs. original: plhs/prhs declared as arrays; missing '*' restored in
// the prhs[1] cast and all size expressions; all scratch buffers and
// DevParam are now freed; smart quotes replaced.
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[] )
{
 float *radius;      // per-epoch (squared) neighbourhood radii
 float StartRadius;
 float EndRadius;
 int trainlen;

 Param = (SParam*) mxCalloc( 1, sizeof(SParam) );

 Param->NumSOM   = ((int *)mxGetData(prhs[1]))[0];
 Param->SizeMSOM = ((int *)mxGetData(prhs[1]))[1];
 Param->SizeNSOM = ((int *)mxGetData(prhs[1]))[2];
 Param->SizeSOM  = Param->SizeMSOM*Param->SizeNSOM;
 Param->Dim      = mxGetN(prhs[2]);
 Param->DataLen  = mxGetM(prhs[2]);

 StartRadius = ((float *)mxGetData(prhs[4]))[0];
 EndRadius   = ((float *)mxGetData(prhs[4]))[1];
 trainlen    = ((int *)mxGetData(prhs[5]))[0];

 Param->NumThreadData = ((int *)mxGetData(prhs[6]))[0];
 Param->NumThreadDim  = ((int *)mxGetData(prhs[6]))[1];
 Param->SizeBlockData = Param->DataLen/Param->NumThreadData;
 Param->SizeBlockDim  = Param->Dim/Param->NumThreadDim;

 // Linearly decreasing radius schedule; the kernels receive the SQUARED
 // radius (funh divides the squared unit distance by 2*radius).
 radius = (float*) mxCalloc( trainlen, sizeof(float) );
 float h = (StartRadius-EndRadius)/(trainlen-1);
 for (int i = 0; i < trainlen; i++)
 {
  radius[i] = (StartRadius-i*h)*(StartRadius-i*h);
  if (radius[i] == 0) radius[i] = 1e-10f;
 }

 cuInit(0);

 int cdev[10];
 char nameDevice[30];
 unsigned int TotalRAM[10];

 CUdevice dev;
 CUdevprop prop;

 // Query and report device 0 via the driver API.
 cuDeviceGetCount(cdev);
 cuDeviceGet(&dev, 0);
 cuDeviceGetName(nameDevice, 30, dev);
 cuDeviceTotalMem(TotalRAM, dev);
 cuDeviceGetProperties(&prop, dev);
 mexPrintf("Device Info \n");
 mexPrintf("Device count = %i \n", cdev[0]);
 mexPrintf("Device name = %s \n", nameDevice);
 mexPrintf("Total RAM = %i Mb \n", 1+TotalRAM[0]/1024/1024);
 mexPrintf("MaxThreadsPerBlock %i\n", prop.maxThreadsPerBlock);
 mexPrintf("MaxThreadsDim( X =%i, Y =%i , Z =%i ) \n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
 mexPrintf("MaxGridSize( X =%i, Y =%i , Z =%i ) \n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
 mexPrintf("WarpSize %i\n", prop.SIMDWidth);
 mexPrintf("Clock Rate %i Mhz \n", prop.clockRate/1000);
 mexPrintf(" ---------------------------------------------- \n");
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess)
 {
  mexPrintf("CUDA ERROR %s", cudaGetErrorString(err));
 }

 // Host buffers (mxCalloc-managed).
 HostAllBmus   = (float*) mxCalloc( Param->DataLen*Param->NumSOM*Param->SizeSOM, sizeof(float) );
 HostBmus      = (int*)   mxCalloc( Param->DataLen*Param->NumSOM, sizeof(int) );
 HostCodeBooks = (float*) mxCalloc( Param->NumSOM*Param->SizeSOM*Param->Dim, sizeof(float) );
 HostQErr      = (float*) mxCalloc( Param->NumSOM, sizeof(float) );
 HostUDist     = (float*) mxCalloc( Param->SizeSOM, sizeof(float) );
 HostTmpSum    = (float*) mxCalloc( Param->NumSOM*Param->SizeSOM*Param->Dim, sizeof(float) );

 // Device buffers and input uploads.
 cudaMalloc((void**)&DeviceCodeBooks, Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float));
 cudaMemcpy(DeviceCodeBooks, (float*)mxGetData(prhs[0]), Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float), cudaMemcpyHostToDevice);
 cudaMalloc((void**)&DeviceData, Param->Dim*Param->DataLen*sizeof(float));
 cudaMemcpy(DeviceData, (float*)mxGetData(prhs[2]), Param->Dim*Param->DataLen*sizeof(float), cudaMemcpyHostToDevice);
 cudaMalloc((void**)&DeviceAllBmus, Param->DataLen*Param->NumSOM*Param->SizeSOM*sizeof(float));
 cudaMalloc((void**)&DeviceBmus, Param->DataLen*Param->NumSOM*sizeof(int));
 cudaMalloc((void**)&DeviceQErr, Param->NumSOM*sizeof(float));
 cudaMalloc((void**)&DeviceUDist, Param->SizeSOM*Param->SizeSOM*sizeof(float));
 cudaMalloc((void**)&DeviceTmpSum, Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float));
 cudaMalloc((void**)&DeviceTmpSum2, Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float));

 // Scratch for CalulateAllNeuronsDimDataParallel.
 cudaMalloc((void**)&DeviceTmpSumDim, Param->NumSOM*Param->SizeSOM*Param->NumThreadDim*Param->NumThreadData*sizeof(float));
 // Scratch for GetBMUDataParallel.
 cudaMalloc((void**)&DeviceTmpSumGetBMUDim, Param->NumSOM*Param->NumThreadData*sizeof(float));
 // Scratch for the data-parallel update kernels.
 cudaMalloc((void**)&DeviceTmpSumPr1Update1, Param->NumThreadData*Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float));
 cudaMalloc((void**)&DeviceTmpSumPr1Update2, Param->NumThreadData*Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float));

 cudaMemcpy(DeviceUDist, (float*)mxGetData(prhs[3]), Param->SizeSOM*Param->SizeSOM*sizeof(float), cudaMemcpyHostToDevice);

 mexPrintf("DataLen ( Size of Data ) %i\n", Param->DataLen);
 mexPrintf("DataDim ( Dimensional Data ) %i\n", Param->Dim);
 mexPrintf("SOMN%i , SOMM %i ,SizeSOM %i (Size of SOM) \n", Param->SizeNSOM, Param->SizeMSOM, Param->SizeSOM);
 mexPrintf("NumSOM Parallel %i ( Number of parallel SOM \n", Param->NumSOM);
 mexPrintf("ThreadData %i (Number of threads for DataLen ) \n", Param->NumThreadData);
 mexPrintf("ThreadDim %i ( Number of threads for dimensional ) \n", Param->NumThreadDim);
 mexPrintf("SizeBlockData %i ( Number of blocks for Data) \n", Param->SizeBlockData);
 mexPrintf("SizeBlockDim %i ( Number of blocks for Dimensional) \n", Param->SizeBlockDim);

 // Copy parameters to the device.
 cudaMalloc((void**)&DevParam, sizeof(SParam));
 cudaMemcpy(DevParam, Param, sizeof(SParam), cudaMemcpyHostToDevice);

 dim3 threads3(Param->SizeSOM, Param->NumThreadData, Param->NumThreadDim);
 dim3 grid3(Param->NumSOM);

 dim3 threads2(Param->SizeSOM, Param->NumThreadData);
 dim3 grid2(Param->NumSOM);

 dim3 threads1(Param->SizeSOM);
 dim3 grid1(Param->NumSOM);

 // Main training loop: distances -> BMU search -> batch update, per epoch.
 // (grid1/grid2 variants of each stage are kept below, commented out.)
 for (int t = 0; t < trainlen; t++)
 {
  //CalulateAllNeurons <<< grid1,threads1 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus, DevParam);
  //CalulateAllNeuronsDataParallel <<< grid2,threads2 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus, DevParam);
  CalulateAllNeuronsDimDataParallel <<< grid3,threads3 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus, DeviceTmpSumDim, DevParam);
  //----------------------------------------------------------------------------------------------//
  //GetBMU <<< grid1, threads1 >>> (DeviceCodeBooks, DeviceAllBmus, DeviceBmus, DeviceQErr, DevParam);
  GetBMUDataParallel <<< grid2,threads2 >>> (DeviceCodeBooks, DeviceAllBmus, DeviceBmus, DeviceQErr, DeviceTmpSumGetBMUDim, DevParam);
  //----------------------------------------------------------------------------------------------//
  //UpdateNurons <<< grid1, threads1 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSum,DeviceTmpSum2,DevParam,radius[t]);
  //UpdateNuronsDataParallel <<< grid2, threads2 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSumPr1Update1,DeviceTmpSumPr1Update2,DevParam,radius[t]);
  UpdateNuronsDimDataParallel <<< grid3, threads3 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSumPr1Update1,DeviceTmpSumPr1Update2,DevParam,radius[t]);
 }
 ///////////////////////////////////////////////////////////////////////////

 cudaMemcpy(HostCodeBooks, DeviceCodeBooks, Param->NumSOM*Param->SizeSOM*Param->Dim*sizeof(float), cudaMemcpyDeviceToHost);

 // Hand the trained codebooks back to MATLAB (mxSetData takes ownership
 // of the mxCalloc'd buffer, so HostCodeBooks must not be freed here).
 plhs[0] = mxCreateNumericMatrix(Param->SizeSOM, Param->NumSOM*Param->Dim, mxSINGLE_CLASS, mxREAL);
 mxSetData(plhs[0], HostCodeBooks);

 // Release all device memory (the scratch buffers and DevParam were
 // leaked in the original code).
 cudaFree(DeviceAllBmus);
 cudaFree(DeviceBmus);
 cudaFree(DeviceQErr);
 cudaFree(DeviceUDist);
 cudaFree(DeviceTmpSum);
 cudaFree(DeviceTmpSum2);
 cudaFree(DeviceTmpSumDim);
 cudaFree(DeviceTmpSumGetBMUDim);
 cudaFree(DeviceTmpSumPr1Update1);
 cudaFree(DeviceTmpSumPr1Update2);
 cudaFree(DevParam);
 cudaFree(DeviceData);
 cudaFree(DeviceCodeBooks);

 err = cudaGetLastError();
 if (err != cudaSuccess)
 {
  mexPrintf("CUDA ERROR %s", cudaGetErrorString(err));
 }

}
[/font]

Compilation options file: nvmexopts.bat

[font="Arial"]@echo off
rem MSVC80OPTS.BAT
rem
rem Compile and link options used for building MEX-files
rem using the Microsoft Visual C++ compiler version 8.0,
rem with nvcc substituted as the compiler so .cu files build.
rem (Smart quotes from the forum paste replaced with plain ASCII quotes;
rem the file is not usable otherwise.)
rem
rem Revision: 1.1.10.2 Date: 2006/06/23 19:04:53
rem
rem ********************************************************************
rem General parameters
rem ********************************************************************

set MATLAB=%MATLAB%
set VS80COMNTOOLS=%VS80COMNTOOLS%
set VSINSTALLDIR=C:\Program Files\Microsoft Visual Studio 8
set VCINSTALLDIR=%VSINSTALLDIR%\VC
set PATH=%VCINSTALLDIR%\BIN;%VCINSTALLDIR%\PlatformSDK\bin;%VSINSTALLDIR%\Common7\IDE;%VSINSTALLDIR%\SDK\v2.0\bin;%VSINSTALLDIR%\Common7\Tools;%VSINSTALLDIR%\Common7\Tools\bin;%VCINSTALLDIR%\VCPackages;%MATLAB_BIN%;%PATH%
set INCLUDE=%VCINSTALLDIR%\ATLMFC\INCLUDE;%VCINSTALLDIR%\INCLUDE;%VCINSTALLDIR%\PlatformSDK\INCLUDE;%VSINSTALLDIR%\SDK\v2.0\include;%INCLUDE%
set LIB=%VCINSTALLDIR%\ATLMFC\LIB;%VCINSTALLDIR%\LIB;%VCINSTALLDIR%\PlatformSDK\lib;%VSINSTALLDIR%\SDK\v2.0\lib;%MATLAB%\extern\lib\win32;%LIB%
set MW_TARGET_ARCH=win32

rem ********************************************************************
rem Compiler parameters
rem ********************************************************************
rem nvcc compiles the .cu translation unit; MSVC host flags are forwarded
rem via -Xcompiler.
set COMPILER=nvcc
set COMPFLAGS=-c -D_WCHAR_T_DEFINED -Xcompiler "/c /Zp8 /GR /W3 /EHsc- /Zc:wchar_t- /DMATLAB_MEX_FILE /nologo"
set OPTIMFLAGS=-D_WCHAR_T_DEFINED -Xcompiler "/MD /O2 /Oy- /DNDEBUG"
set DEBUGFLAGS=-D_WCHAR_T_DEFINED -Xcompiler "/MD /Zi /Fd"%OUTDIR%%MEX_NAME%%MEX_EXT%.pdb""
set NAME_OBJECT=-o

rem ********************************************************************
rem Linker parameters
rem ********************************************************************
set LIBLOC=%MATLAB%\extern\lib\win32\microsoft
set LINKER=link
rem set LINKER=nvcc
set LINKFLAGS=/dll /export:%ENTRYPOINT% /MAP /LIBPATH:"%LIBLOC%" libmx.lib libmex.lib libmat.lib /implib:%LIB_NAME%.x /MACHINE:X86 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib
set LINKOPTIMFLAGS=
set LINKDEBUGFLAGS=/DEBUG /PDB:"%OUTDIR%%MEX_NAME%%MEX_EXT%.pdb"
set LINK_FILE=
set LINK_LIB=
set NAME_OUTPUT=/out:"%OUTDIR%%MEX_NAME%%MEX_EXT%"
set RSP_FILE_INDICATOR=@

rem ********************************************************************
rem Resource compiler parameters
rem ********************************************************************
set RC_COMPILER=rc /fo "%OUTDIR%mexversion.res"
set RC_LINKER=

set POSTLINK_CMDS=del "%OUTDIR%%MEX_NAME%.map"
set POSTLINK_CMDS1=del %LIB_NAME%.x
set POSTLINK_CMDS2=mt -outputresource:"%OUTDIR%%MEX_NAME%%MEX_EXT%";2 -manifest "%OUTDIR%%MEX_NAME%%MEX_EXT%.manifest"
set POSTLINK_CMDS3=del "%OUTDIR%%MEX_NAME%%MEX_EXT%.manifest"
[/font]

comp.m
[font="Times"]nvmex -f nvmexopts.bat gpu_batchsom.cu -IC:\cuda\include -LC:\cuda\lib -lcudart -lcublas -lcuda[/font]

CheckSOM.m

[font="Arial"]% CheckSOM.m — compares SOM Toolbox batch training on the CPU against the
% gpu_batchsom MEX routine on the same data and initial codebooks.
% Fixes vs. original paste: 'Y=[];' restored, missing '*' in normrnd calls,
% smart quotes replaced.
Y=[];

Dim=3;

% Two Gaussian clusters (around 1 and around 5), 200 points each.
one=ones(1,Dim);
for i=1:1:200
    p1=normrnd(one,0.5*one);
    p2=normrnd(5*one,0.5*one);
    Y=[Y;p1];
    Y=[Y;p2];
end

Dim=size(Y,2);
%disp('--------------------------------------------------');
msize=[5 6];

sTopol.msize = msize;
sTopol.type = 'som_topol';
sTopol.lattice = 'rect';
sTopol.shape = 'sheet';

% Random initial codebooks shared by both trainers.
sTmp = som_randinit(Y, 'msize', msize, 'sheet', 'rect');
sMap1 = sTmp;
Codebooks=[sMap1.codebook];

% Squared inter-unit distances, as expected by the GPU kernels.
Ud = som_unit_dists(sTopol);
Ud = Ud.^2;

% GPU training: 1 SOM of size 5x6, radius 9 -> 0, 20 epochs, 1x1 thread split.
GPUCodebooks=gpu_batchsom(single(Codebooks),int32([1 5 6]),single(Y),single(Ud),single([9 0]),int32(20),int32([1 1]));

% Reference CPU training with identical parameters.
sMap1=som_batchtrain(sMap1,Y,'radius_ini',9,'radius_fin',0,'trainlen',20);

disp('SOM1');
som_quality(sMap1,Y)
%sMap1.codebook

Dim
sTmp.codebook=GPUCodebooks(:,1:Dim);

som_quality(sTmp,Y)
[/font]

fig 1. batchSOM.jpg — timing diagram.
Left: training time for the SOMs on the CPU; right: on the GPU (GeForce 9600GT), 10 SOMs.
batchSOM.JPG