If you need to Train large SOM, or you have a lot data Train, or you problem is high dimesional.
gpu_batchsom.cu
[font=“Arial”]//
// Parallel Batch Train SOM
// Parallel NumSOM
// Parallel SOMSize
// Parallel DataLen
// Parallel Dim
//
#include <stdlib.h>
#include <minmax.h>
#include “mex.h”
#include <float.h>
#include “cuda_runtime_api.h”
#include “cublas.h”
#include “matrix.h”
#include “cuda.h”
struct Bmus
{
float err;
int idNeuron;
int idData;
};
struct SParam
{
int DataLen;
int NumSOM;
int Dim;
int SizeMSOM;
int SizeNSOM;
int SizeSOM;
int NumThreadData;
int NumThreadDim;
int SizeBlockDim;
int ModBlockDim;
int SizeBlockData;
int ModBlockData;
};
device float *DeviceCodeBooks;
device float *DeviceData;
device float *DeviceAllBmus;
device int *DeviceBmus;
device float *DeviceQErr;
device float *DeviceUDist;
device float *DeviceTmpSum;
device float *DeviceTmpSum2;
device float *DeviceTmpSumDim;
device float *DeviceTmpSumGetBMUDim;
device float *DeviceTmpSumPr1Update1;
device float *DeviceTmpSumPr1Update2;
device int *DeviceSizePerDataBlocks;
device int *DeviceSizePerDimBlocks;
device SParam *DevParam;
float *HostCodeBooks;
float *HostAllBmus;
int *HostBmus;
float *HostQErr;
float *HostUDist;
float *HostTmpSum;
int *HostSizePerDataBlocks;
int *HostSizePerDimBlocks;
//float *STmpSum;
SParam *Param;
global void CalulateAllNeurons(float *CodeBooks, float *Data, float *allbmus, SParam Param)
{/ blockDim.x - КолличетÑво SOM
blockId.x - Ðомер SOM
threadIdx.x - Ðомер нейрона
*/
const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
for (int i = 0; iDataLen;i++)
{
float SUM=0;
float d=0;
for (int k=0; k<Param->Dim;k++)
{
d=CodeBooks[(iSom*Param->Dim+k)*Param->SizeSOM+iNeuron]-Data[k*Param->DataLen+i];
SUM=SUM+d*d;
}
allbmus[(i*Param->NumSOM+iSom)*Param->SizeSOM+iNeuron]=sqrtf(SUM);
}
}
///////////////////////////////////////////////////////////////////////////
global void CalulateAllNeuronsDataParallel(float *CodeBooks, float *Data, float *allbmus, SParam *Param)
{ // blockDim.x - КолличетÑво SOM Numbers of SOM
// blockId.x - Ðомер SOM Index of SOM
// threadIdx.x - Ðомер нейрона Index of neuron
const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
const int iData=threadIdx.y;
int dataindex;
// Параллельное вычиÑление по вÑе векторам данных
for (int i = 0; iSizeBlockData; i++)
{
dataindex = Param->SizeBlockData*iData + i;
float SUM=0;
float d=0;
for (int k=0; kDim;k++)
{
d=CodeBooks[(iSomParam->Dim+k)Param->SizeSOM+iNeuron]-Data[kParam->DataLen+dataindex];
SUM=SUM+dd;
}
allbmus[(dataindex*Param->NumSOM+iSom)*Param->SizeSOM+iNeuron]=sqrtf(SUM);
}
__syncthreads();
}
///////////////////////////////////////////////////////////////////////////
global void CalulateAllNeuronsDimDataParallel(float *CodeBooks, float *Data, float *allbmus,float *Stmp, SParam *Param )
{const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
const int iData=threadIdx.y;
const int iDim=threadIdx.z;
int dataindex;
int dimindex;
// Параллельное вычиÑление по вÑе векторам данных
for (int i = 0; iSizeBlockData; i++)
{
dataindex = Param->SizeBlockDataiData + i;
float SubSUM=0;
float d=0;
// Параллельное вычиÑление по вÑей размерноÑти данных
for (int k=0; kSizeBlockDim;k++)
{
dimindex= Param->SizeBlockDimiDim+k;
d=CodeBooks[(iSomParam->Dim+dimindex)Param->SizeSOM+iNeuron]-Data[dimindexParam->DataLen+dataindex];
SubSUM=SubSUM+dd;
}
Stmp[(iSomParam->NumThreadDim+iDim)Param->SizeSOMParam->NumThreadData+iNeuronParam->NumThreadData+iData]=SubSUM;
// Stmp[iNeuron]=SubSUM;
__syncthreads(); // ДождемÑÑ Ð¾ÐºÐ¾Ð½Ñ‡Ð°Ð½Ð¸Ñ Ð²Ñех потоков
float SUM;
SUM=0;
if (iDim==0) // Один из потоков берет на ÑÐµÐ±Ñ Ñ„ÑƒÐ½ÐºÑ†Ð¸ÑŽ ÑуммированиÑ
{ // Ð’Ñех временных Ñумм
for (int k=0; kNumThreadDim;k++) // По вÑем временным Ñуммам
{
SUM=SUM+ Stmp[(iSomParam->NumThreadDim+k)Param->SizeSOMParam->NumThreadData+iNeuronParam->NumThreadData+iData] ;
// SUM=SUM+ Stmp[iNeuron];
}
allbmus[(dataindex*Param->NumSOM+iSom)*Param->SizeSOM+iNeuron]=sqrtf(SUM);
}
}
}
///////////////////////////////////////////////////////////////////////////
global void GetBMU(float *CodeBooks,float *allbmus,int *bmus,float *QErr, SParam *Param)
{
//shared
float Min;
float tmpS;
const int iNeuron=threadIdx.x;
const int iSom=blockIdx.x;
if (iNeuron==0)
{
tmpS=0;
for (int i = 0; iDataLen;i++)
{
Min=3e+8;
for (int k=0;kSizeSOM;k++)
{
float d=allbmus[(i*Param->NumSOM+iSom)*Param->SizeSOM+k];
if (d<=Min){ Min=d; bmus[i*Param->NumSOM+iSom]=k+1;
}
}
tmpS=tmpS+Min;
}
QErr[iSom]=tmpS/Param->DataLen;
}
}
///////////////////////////////////////////////////////////////////////////
global void GetBMUDataParallel(float *CodeBooks,float *allbmus,int *bmus,float *QErr,float *TmpSum3, SParam *Param)
{
//shared
float Min;
float tmpS;
const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
const int iData=threadIdx.y;
tmpS=0;
for (int i = 0; iSizeBlockData;i++) // паралельноÑть по данным
{
Min=3e+8;
for (int k=0;kSizeSOM;k++)
{
float d=allbmus[((iData*Param->SizeBlockData+i)*Param->NumSOM+iSom)Param->SizeSOM+k];
if (d<=Min){ Min=d; bmus[(iDataParam->SizeBlockData+i)*Param->NumSOM+iSom]=k+1; }
}
tmpS=tmpS+Min;
}
TmpSum3[iData*Param->NumSOM+iSom]=tmpS/Param->SizeBlockData;
__syncthreads(); // wait all threads
if (iData==0)
{
tmpS=0;
for (int i = 0; i < Param->NumThreadData; i++)
tmpS=tmpS+TmpSum3[i*Param->NumSOM+iSom];
QErr[iSom]=tmpS/Param->NumThreadData;
}
}
//////////////////////////////////////////////////////////////////////////
device float funh(float radius,float *Dists,int x,int y,SParam *Param )
{
return expf(-Dists[Param->SizeSOM*x+y]/(2*radius));
}
global void UpdateNurons(float *CodeBooks,float *Data,int *bmus,float *QErr,float *UDist,float *TmpSum,float *TmpSum2, SParam *Param,float radius)
{
const int iNeuron=threadIdx.x;
const int iSom=blockIdx.x;
float sumz;
float sumc;
sumc=0;
sumz=0;
for (int i=0;iDataLen;i++)
{
if (i==0) {
for (int k=0;k<Param->Dim;k++)
{
int index=(iSom*Param->Dim+k)*Param->SizeSOM+iNeuron;
TmpSum[index]=0;
TmpSum2[index]=0;
}
}
if (iNeuron==bmus[i*Param->NumSOM+iSom]-1)
{
for (int k=0;k<Param->Dim;k++)
{
int index=(iSom*Param->Dim+k)*Param->SizeSOM+iNeuron;
TmpSum[index]=TmpSum[index]+Data[k*Param->DataLen+i];
TmpSum2[index]++;
}
}
}
for (int i=0;iDim;i++)
{
sumc=0;
sumz=0;
for (int k=0;kSizeSOM;k++)
{
int index=(iSom*Param->Dim+i)*Param->SizeSOM+k;
//////////////////////////////////////////////////////////////////////////
//sumc=TmpSum[index];
sumc=sumc+funh(radius,UDist,iNeuron,k,Param)*TmpSum[index];
sumz=sumz+funh(radius,UDist,iNeuron,k,Param)*TmpSum2[index];
/////////////////////////////////////////////////////////////////////////
}
if (sumz>0) CodeBooks[(iSom*Param->Dim+i)*Param->SizeSOM+iNeuron]=sumc/sumz;
}
}
///////////////////////////////////////////////////////////////////////////
global void UpdateNuronsDataParallel(float *CodeBooks,float *Data,int *bmus,float *QErr,float *UDist,float *TmpSum,float *TmpSum2, SParam *Param,float radius)
{
const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
const int iData=threadIdx.y;
float sumz;
float sumc;
sumc=0;
sumz=0;
for (int i=0;iSizeBlockData;i++)
{
if (i==0)
{
// U[iData]=0;
for (int k=0;kDim;k++)
{
//int index=(iSomParam->Dim+k)Param->SizeSOM+iNeuron;
int index =(iSomParam->Dim+k)Param->NumThreadDataParam->SizeSOM+Param->SizeSOMiData+iNeuron;
TmpSum[index]=0;
TmpSum2[index]=0;
}
}
// if (iNeuron==bmus[i*Param->NumSOM+iSom]-1)
if (iNeuron==bmus[(iDataParam->SizeBlockData+i)Param->NumSOM+iSom]-1)
{
for (int k=0;kDim;k++)
{
// U[iData]=11;
// int index=(iSomParam->Dim+k)Param->SizeSOM+iNeuron;
// kParam->DataLen+i
int index = (iSomParam->Dim+k)Param->NumThreadDataParam->SizeSOM+Param->SizeSOMiData+iNeuron;
int dataindex = kParam->DataLen+Param->SizeBlockData*iData + i;
TmpSum[index]=TmpSum[index]+Data[dataindex];
TmpSum2[index]++;
}
}
}
__syncthreads();
if (iData==0)
{
for (int k=0; kDim;k++)
{
for (int i=1; i < Param->NumThreadData; i++)
{
int indexsave = (iSom*Param->Dim+k)*Param->NumThreadData*Param->SizeSOM+Param->SizeSOM*iData+iNeuron;
int indexwhatsave = (iSom*Param->Dim+k)*Param->NumThreadData*Param->SizeSOM+Param->SizeSOM*i+iNeuron;
TmpSum[indexsave]= TmpSum[indexsave]+TmpSum[indexwhatsave];
TmpSum2[indexsave]= TmpSum2[indexsave]+TmpSum2[indexwhatsave];
}
}
for (int i=0;iDim;i++)
{
sumc=0;
sumz=0;
for (int k=0;kSizeSOM;k++)
{
// int index=(iSomParam->Dim+i)Param->SizeSOM+k;
int index = (iSomParam->Dim+i)Param->NumThreadDataParam->SizeSOM+Param->SizeSOMiData+k;
//////////////////////////////////////////////////////////////////////////
//sumc=TmpSum[index];
sumc=sumc+funh(radius,UDist,iNeuron,k,Param)*TmpSum[index];
sumz=sumz+funh(radius,UDist,iNeuron,k,Param)*TmpSum2[index];
/////////////////////////////////////////////////////////////////////////
}
if (sumz>0) CodeBooks[(iSom*Param->Dim+i)*Param->SizeSOM+iNeuron]=sumc/sumz;
}
}
}
///////////////////////////////////////////////////////////////////////////
global void UpdateNuronsDimDataParallel(float *CodeBooks,float *Data,int *bmus,float *QErr,float *UDist,float *TmpSum,float *TmpSum2, SParam *Param,float radius)
{
const int iSom=blockIdx.x;
const int iNeuron=threadIdx.x;
const int iData=threadIdx.y;
const int iDim=threadIdx.z;
float sumz;
float sumc;
sumc=0;
sumz=0;
for (int i=0;iSizeBlockData;i++)
{
if (i==0)
{
// U[iData]=0;
for (int k=0;kSizeBlockDim;k++)
{
//int index=(iSomParam->Dim+k)Param->SizeSOM+iNeuron;
//int index =(iSomParam->Dim+k)Param->NumThreadDataParam->SizeSOM+Param->SizeSOMiData+iNeuron;
int index =(iSomParam->Dim+iDimParam->SizeBlockDim+k)Param->NumThreadDataParam->SizeSOM+Param->SizeSOM*iData+iNeuron;
TmpSum[index]=0;
TmpSum2[index]=0;
}
}
// if (iNeuron==bmus[i*Param->NumSOM+iSom]-1)
if (iNeuron==bmus[(iDataParam->SizeBlockData+i)Param->NumSOM+iSom]-1)
{
for (int k=0;kSizeBlockDim;k++)
{
// U[iData]=11;
// int index=(iSomParam->Dim+k)Param->SizeSOM+iNeuron;
// kParam->DataLen+i
int index = (iSomParam->Dim+iDimParam->SizeBlockDim+k)Param->NumThreadDataParam->SizeSOM+Param->SizeSOMiData+iNeuron;
int dataindex = (iDim*Param->SizeBlockDim+k)Param->DataLen+Param->SizeBlockDataiData + i;
TmpSum[index]=TmpSum[index]+Data[dataindex];
TmpSum2[index]++;
}
}
}
__syncthreads();
if (iData==0)
{
for (int k=0;kSizeBlockDim;k++)
{
for (int i=1; i < Param->NumThreadData; i++)
{
int indexsave = (iSom*Param->Dim+iDim*Param->SizeBlockDim+k)*Param->NumThreadData*Param->SizeSOM+Param->SizeSOM*iData+iNeuron;
int indexwhatsave = (iSom*Param->Dim+iDim*Param->SizeBlockDim+k)*Param->NumThreadData*Param->SizeSOM+Param->SizeSOM*i+iNeuron;
TmpSum[indexsave]= TmpSum[indexsave]+TmpSum[indexwhatsave];
TmpSum2[indexsave]= TmpSum2[indexsave]+TmpSum2[indexwhatsave];
}
}
__syncthreads();
for (int i=0;iSizeBlockDim;i++)
{
sumc=0;
sumz=0;
for (int k=0;kSizeSOM;k++)
{
// int index=(iSomParam->Dim+i)Param->SizeSOM+k;
int index = (iSomParam->Dim+iDimParam->SizeBlockDim+i)Param->NumThreadDataParam->SizeSOM+Param->SizeSOM*iData+k;
//////////////////////////////////////////////////////////////////////////
//sumc=TmpSum[index];
sumc=sumc+funh(radius,UDist,iNeuron,k,Param)*TmpSum[index];
sumz=sumz+funh(radius,UDist,iNeuron,k,Param)*TmpSum2[index];
/////////////////////////////////////////////////////////////////////////
}
if (sumz>0) CodeBooks[(iSomParam->Dim+iDimParam->SizeBlockDim+i)*Param->SizeSOM+iNeuron]=sumc/sumz;
}
}
}
///////////////////////////////////////////////////////////////////////////
void mexFunction( int nlhs, mxArray *plhs,
int nrhs, const mxArray *prhs)
{
float *radius;
float StartRadius;
float EndRadius;
int trainlen;
Param = (SParam*) mxCalloc( 1,sizeof(SParam) );
Param->NumSOM = ((int *)mxGetData(prhs[1]))[0];
Param->SizeMSOM = ((int *)mxGetData(prhs[1]))[1];
Param->SizeNSOM = ((int )mxGetData(prhs[1]))[2];
Param->SizeSOM=Param->SizeMSOMParam->SizeNSOM;
Param->Dim = mxGetN(prhs[2]);
Param->DataLen = mxGetM(prhs[2]);
StartRadius = ((float *)mxGetData(prhs[4]))[0];
EndRadius = ((float *)mxGetData(prhs[4]))[1];
trainlen = ((int *)mxGetData(prhs[5]))[0];
Param->NumThreadDim = int(((int *)mxGetData(prhs[6]))[1]);
Param->NumThreadData = int(((int *)mxGetData(prhs[6]))[0]);
Param->SizeBlockData = Param->DataLen / Param->NumThreadData;
Param->SizeBlockDim = Param->Dim / Param->NumThreadDim;
radius = (float*) mxCalloc( trainlen ,sizeof(float) );
float h=(StartRadius-EndRadius)/(trainlen-1);
for (int i=0;i<trainlen;i++)
{
radius[i]=(StartRadius-ih)(StartRadius-i*h);
if (radius[i]==0) radius[i]=1e-10;
}
cuInit(0);
//int z=cuDeviceTotalMem();
int cdev[10];
int devcount;
char nameDevice[30];
unsigned int TotalRAM[10];
CUdevice dev;
CUdevprop prop;
cuDeviceGetCount(cdev);
cuDeviceGet(&dev,0);
cuDeviceGetName(nameDevice,30,dev);
cuDeviceTotalMem(TotalRAM,dev);
cuDeviceGetProperties(&prop,dev);
mexPrintf(“Device Info \n”);
mexPrintf(“Device count = %i \n”,cdev[0]);
mexPrintf(“Device name = %s \n”,nameDevice);
mexPrintf(“Total RAM = %i Mb \n”,1+TotalRAM[0]/1024/1024);
mexPrintf(“MaxThreadsPerBlock %i\n”,prop.maxThreadsPerBlock);
mexPrintf(“MaxThreadsDim( X =%i, Y =%i , Z =%i ) \n”,prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2]);
mexPrintf(“MaxGridSize( X =%i, Y =%i , Z =%i ) \n”,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
mexPrintf(“WarpSize %i\n”,prop.SIMDWidth);
mexPrintf(“Clock Rate %i Mhz \n”, prop.clockRate/1000);
mexPrintf(" ---------------------------------------------- \n");
cudaError_t err=cudaGetLastError();
if (err!=0)
{
printf(“CUDA ERROR %s”,cudaGetErrorString(err));
}
HostAllBmus = (float*) mxCalloc( Param->DataLenParam->NumSOMParam->SizeMSOMParam->SizeNSOM,sizeof(float) );
HostBmus = (int) mxCalloc( Param->DataLenParam->NumSOM,sizeof(int) );
HostCodeBooks = (float) mxCalloc( Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dim,sizeof(float) );
HostQErr = (float) mxCalloc( Param->NumSOM,sizeof(float) );
HostUDist = (float*) mxCalloc( Param->SizeSOM,sizeof(float) );
HostTmpSum = (float*) mxCalloc( Param->NumSOMParam->SizeMSOMParam->SizeNSOM*Param->Dim,sizeof(float) );
cudaMalloc((void**) &DeviceCodeBooks, Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float));
cudaMemcpy(DeviceCodeBooks,(float )mxGetData(prhs[0]), Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float),cudaMemcpyHostToDevice);
cudaMalloc((void**) &DeviceData, Param->DimParam->DataLensizeof(float));
cudaMemcpy(DeviceData,(float )mxGetData(prhs[2]), Param->DimParam->DataLensizeof(float),cudaMemcpyHostToDevice);
cudaMalloc((void**) &DeviceAllBmus, Param->DataLenParam->NumSOMParam->SizeMSOMParam->SizeNSOM sizeof(float));
cudaMalloc((void**) &DeviceBmus, Param->DataLenParam->NumSOM sizeof(int));
cudaMalloc((void**) &DeviceQErr, Param->NumSOM* sizeof(float));
cudaMalloc((void**) &DeviceUDist, Param->SizeSOMParam->SizeSOM sizeof(float));
cudaMalloc((void*) &DeviceTmpSum, Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float));
cudaMalloc((void**) &DeviceTmpSum2, Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dim*sizeof(float));
// Additional for CalculateDimData
cudaMalloc((void**) &DeviceTmpSumDim, Param->NumSOMParam->SizeSOMParam->NumThreadDimParam->NumThreadDatasizeof(float));
// for GetBMU
cudaMalloc((void**) &DeviceTmpSumGetBMUDim, Param->NumSOMParam->NumThreadDatasizeof(float));
// for Update
cudaMalloc((void**) &DeviceTmpSumPr1Update1, Param->NumThreadDataParam->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float));
cudaMalloc((void**) &DeviceTmpSumPr1Update2, Param->NumThreadDataParam->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float));
cudaMemcpy(DeviceUDist,(float )mxGetData(prhs[3]), Param->SizeSOMParam->SizeSOM*sizeof(float),cudaMemcpyHostToDevice);
mexPrintf(“DataLen ( Size of Data ) %i\n”,Param->DataLen);
mexPrintf(“DataDim ( Dimensional Data ) %i\n”,Param->Dim);
mexPrintf(“SOMN%i , SOMM %i ,SizeSOM %i (Size of SOM) \n”,Param->SizeNSOM,Param->SizeMSOM, Param->SizeSOM);
mexPrintf(“NumSOM Parallel %i ( Number of parallel SOM \n”, Param->NumSOM);
mexPrintf(“ThreadData %i (Number of threads for DataLen ) \n”,Param->NumThreadData);
mexPrintf(“ThreadDim %i ( Number of threads for dimensional ) \n”,Param->NumThreadDim);
mexPrintf(“SizeBlockData %i ( Number of blocks for Data) \n”,Param->SizeBlockData);
mexPrintf(“SizeBlockDim %i ( Number of blocks for Dimensional) \n”,Param->SizeBlockDim);
//for (i=0;iNumThreadData;i++)
/* device int *DeviceSizePerDataBlocks;
device int *DeviceSizePerDimBlocks;
*/
// Copy param to device
cudaMalloc((void**) &DevParam, sizeof(SParam));
cudaMemcpy(DevParam,Param, sizeof(SParam),cudaMemcpyHostToDevice);
dim3 threads3(Param->SizeSOM,Param->NumThreadData,Param->NumThreadDim);
dim3 grid3(Param->NumSOM);
dim3 threads2(Param->SizeSOM,Param->NumThreadData);
dim3 grid2(Param->NumSOM);
dim3 threads1(Param->SizeSOM);
dim3 grid1(Param->NumSOM);
//// PARALLELLLLLL PROCESSSS !!!
for (int t=0;t<trainlen;t++)
{
//CalulateAllNeurons <<< grid1,threads1 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus, DevParam);
//CalulateAllNeuronsDataParallel <<< grid2,threads2 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus, DevParam);
CalulateAllNeuronsDimDataParallel <<< grid3,threads3 >>> (DeviceCodeBooks, DeviceData, DeviceAllBmus,DeviceTmpSumDim, DevParam);
//----------------------------------------------------------------------------------------------//
// GetBMU <<< grid1, threads1 >>> (DeviceCodeBooks, DeviceAllBmus, DeviceBmus,DeviceQErr, DevParam);
GetBMUDataParallel <<<grid2,threads2 >>>(DeviceCodeBooks, DeviceAllBmus, DeviceBmus,DeviceQErr,DeviceTmpSumGetBMUDim, DevParam);
//----------------------------------------------------------------------------------------------//
//printf(“.\n”);
//UpdateNurons<<< grid1, threads1 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSum,DeviceTmpSum2,DevParam,radius[t]);
//UpdateNuronsDataParallel<<< grid2, threads2 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSumPr1Update1,DeviceTmpSumPr1Update2,DevParam,radius[t]);
UpdateNuronsDimDataParallel<<< grid3, threads3 >>> (DeviceCodeBooks,DeviceData,DeviceBmus,DeviceQErr,DeviceUDist,DeviceTmpSumPr1Update1,DeviceTmpSumPr1Update2,DevParam,radius[t]);
//----------------------------------------------------------------------------------------------//
//cudaMemcpy(HostQErr,DeviceQErr, Param->NumSOM* sizeof(float),cudaMemcpyDeviceToHost);
}
///////////////////////////////////////////////////////////////////////////
cudaMemcpy(HostCodeBooks,DeviceCodeBooks, Param->NumSOMParam->SizeMSOMParam->SizeNSOMParam->Dimsizeof(float),cudaMemcpyDeviceToHost);
plhs[0]=mxCreateNumericMatrix(Param->SizeMSOMParam->SizeNSOM,Param->NumSOMParam->Dim,mxSINGLE_CLASS,0);
mxSetData(plhs[0],HostCodeBooks);
//mxFree(HostCodeBooks);
cudaFree(DeviceAllBmus);
cudaFree(DeviceBmus);
cudaFree(DeviceQErr);
cudaFree(DeviceUDist);
cudaFree(DeviceTmpSum);
cudaFree(DeviceTmpSum2);
cudaFree(DeviceData);
cudaFree(DeviceCodeBooks);
err=cudaGetLastError();
if (err!=0)
{
printf(“CUDA ERROR %s”,cudaGetErrorString(err));
}
}[/font]
Compliation nvmexopts.bat
[font=“Arial”]@echo off
rem MSVC80OPTS.BAT
rem
rem Compile and link options used for building MEX-files
rem using the Microsoft Visual C++ compiler version 8.0
rem
rem $Revision: 1.1.10.2 $ $Date: 2006/06/23 19:04:53 $
rem
rem ********************************************************************
rem General parameters
rem ********************************************************************
set MATLAB=%MATLAB%
set VS80COMNTOOLS=%VS80COMNTOOLS%
set VSINSTALLDIR=C:\Program Files\Microsoft Visual Studio 8
set VCINSTALLDIR=%VSINSTALLDIR%\VC
set PATH=%VCINSTALLDIR%\BIN;%VCINSTALLDIR%\PlatformSDK\bin;%VSINSTALLDIR%\Common7\IDE;%VSINSTALLDIR%\SDK\v2.0\bin;%VSINSTALLDIR%\Common7\Tools;%VSINSTALLDIR%\Common7\Tools\bin;%VCINSTALLDIR%\VCPackages;%MATLAB_BIN%;%PATH%
set INCLUDE=%VCINSTALLDIR%\ATLMFC\INCLUDE;%VCINSTALLDIR%\INCLUDE;%VCINSTALLDIR%\PlatformSDK\INCLUDE;%VSINSTALLDIR%\SDK\v2.0\include;%INCLUDE%
set LIB=%VCINSTALLDIR%\ATLMFC\LIB;%VCINSTALLDIR%\LIB;%VCINSTALLDIR%\PlatformSDK\lib;%VSINSTALLDIR%\SDK\v2.0\lib;%MATLAB%\extern\lib\win32;%LIB%
set MW_TARGET_ARCH=win32
rem ********************************************************************
rem Compiler parameters
rem ********************************************************************
set COMPILER=nvcc
set COMPFLAGS=-c -D_WCHAR_T_DEFINED -Xcompiler “/c /Zp8 /GR /W3 /EHsc- /Zc:wchar_t- /DMATLAB_MEX_FILE /nologo”
set OPTIMFLAGS=-D_WCHAR_T_DEFINED -Xcompiler “/MD /O2 /Oy- /DNDEBUG”
set DEBUGFLAGS=-D_WCHAR_T_DEFINED -Xcompiler “/MD /Zi /Fd”%OUTDIR%%MEX_NAME%%MEX_EXT%.pdb"
set NAME_OBJECT=-o
rem ********************************************************************
rem Linker parameters
rem ********************************************************************
set LIBLOC=%MATLAB%\extern\lib\win32\microsoft
set LINKER=link
rem set LINKER=nvcc
set LINKFLAGS=/dll /export:%ENTRYPOINT% /MAP /LIBPATH:“%LIBLOC%” libmx.lib libmex.lib libmat.lib /implib:%LIB_NAME%.x /MACHINE:X86 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib
set LINKOPTIMFLAGS=
set LINKDEBUGFLAGS=/DEBUG /PDB:“%OUTDIR%%MEX_NAME%%MEX_EXT%.pdb”
set LINK_FILE=
set LINK_LIB=
set NAME_OUTPUT=/out:“%OUTDIR%%MEX_NAME%%MEX_EXT%”
set RSP_FILE_INDICATOR=@
rem ********************************************************************
rem Resource compiler parameters
rem ********************************************************************
set RC_COMPILER=rc /fo “%OUTDIR%mexversion.res”
set RC_LINKER=
set POSTLINK_CMDS=del “%OUTDIR%%MEX_NAME%.map”
set POSTLINK_CMDS1=del %LIB_NAME%.x
set POSTLINK_CMDS2=mt -outputresource:“%OUTDIR%%MEX_NAME%%MEX_EXT%”;2 -manifest “%OUTDIR%%MEX_NAME%%MEX_EXT%.manifest”
set POSTLINK_CMDS3=del “%OUTDIR%%MEX_NAME%%MEX_EXT%.manifest”
[/font]
comp.m
[font=“Times”]nvmex -f nvmexopts.bat gpu_batchsom.cu -IC:\cuda\include -LC:\cuda\lib -lcudart -lcublas -lcuda [/font]
CheckSOM.m
[font=“Arial”]Y=;
Dim=3;
one=[ones(1,Dim)];
for i=1:1:200
p1=normrnd(one,0.5one);
p2=normrnd(5one,0.5*one);
Y=[Y;p1];
Y=[Y;p2];
end
Dim=size(Y,2);
%disp(‘--------------------------------------------------’);
msize=[5 6];
sTopol.msize = msize;
sTopol.type = ‘som_topol’;
sTopol.lattice = ‘rect’;
sTopol.shape = ‘sheet’;
%Codebooks=[sMap1.codebook];
sTmp = som_randinit(Y, ‘msize’, msize,‘sheet’,‘rect’ );
sMap1 = sTmp;
Codebooks=[sMap1.codebook];
Ud = som_unit_dists(sTopol);
Ud = Ud.^2;
GPUCodebooks=gpu_batchsom(single(Codebooks),int32([1 5 6]),single(Y),single(Ud),single([9 0]),int32(20),int32([1 1 ]));
sMap1=som_batchtrain(sMap1,Y,‘radius_ini’,9 ,‘radius_fin’,0,‘trainlen’,20);
disp(‘SOM1’);
som_quality(sMap1,Y)
%sMap1.codebook
Dim
sTmp.codebook=GPUCodebooks(:,1:Dim);
som_quality(sTmp,Y)
[/font]
