Hmm, no I have not.
I made the mistake of not including the x64 compiler, which is not installed by default in VS2008 Pro. That has been fixed.
Then I found the project did not have any build rule for compiling .cu files. I fixed this as follows:
right-click on the project -> Custom Build Rules...
tick "CUDA API Runtime Build Rule (v3.2)".
Curiously there are two of these, one of which comes from the file
NVCudaRuntimeAPI.rules
and the other from the file
NvCudaRuntimeAPI.V3.2.rules
Both give the following output when compiling cuSVMSolver.cu:
1>------ Build started: Project: cuSVMTrain, Configuration: Debug x64 ------
1>Compiling with CUDA Build Rule...
1>"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\bin\nvcc.exe" -gencode=arch=compute_10,code="sm_10,compute_10" -gencode=arch=compute_20,code="sm_20,compute_20" --machine 32 -ccbin "g:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -I"g:\cuSVM\cuSVMTrain" -I"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\include" -maxrregcount=32 --compile -o "x64\Debug/cuSVMSolver.cu.obj" cuSVMSolver.cu
1>nvcc fatal : Don't know what to do with 'Toolkit/CUDA/v3.2//include -maxrregcount=32 --compile -o x64/Debug/cuSVMSolver.cu.obj g:/cuSVM/cuSVMTrain/cuSVMSolver.cu '
1>Build log was saved at "file://g:\cuSVM\cuSVMTrain\x64\Debug\BuildLog.htm"
1>cuSVMTrain - 0 error(s), 0 warning(s)
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========
I am not certain I should be using "CUDA API Runtime Build Rule (v3.2)". Maybe I should instead use "CUDA Driver API Build Rule (v3.2)", in which case I get:
1>------ Build started: Project: cuSVMTrain, Configuration: Debug x64 ------
1>Compiling with CUDA Build Rule...
1>"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\bin\nvcc.exe" -arch sm_10 -ccbin "g:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -I"g:\cuSVM\cuSVMTrain" -I"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\include" -maxrregcount=32 -m32 -cubin -o "x64\Debug/cuSVMSolver.cubin" cuSVMSolver.cu
1>nvcc fatal : Don't know what to do with 'Computing'
1>Build log was saved at "file://g:\cuSVM\cuSVMTrain\x64\Debug\BuildLog.htm"
1>cuSVMTrain - 0 error(s), 0 warning(s)
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========
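For what it's worth, in both logs nvcc appears to be splitting the toolkit path at its spaces: the first error starts mid-path at 'Toolkit/CUDA/v3.2//include' (note the doubled slash, as if the macro behind the -I ends in a trailing backslash that escapes the closing quote), and the second chokes on the bare word 'Computing' from "NVIDIA GPU Computing Toolkit". As a sketch of what I believe the intended command should look like (my guess, not output from a working build), with the include path quoted and no trailing backslash:
"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\bin\nvcc.exe" --machine 32 -ccbin "g:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT" -I"g:\cuSVM\cuSVMTrain" -I"G:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.2\include" -maxrregcount=32 --compile -o "x64\Debug\cuSVMSolver.cu.obj" cuSVMSolver.cu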
Here is the routine:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits.h>
#include <ctype.h>
#include <float.h>
#include <vector>
#include <algorithm>
#include <math.h>
#include "cublas.h"
#include "mex.h"
#include "cuda.h"
#include "cuSVMutil.h"
__constant__ float C;
__constant__ float taumin;
__constant__ float kernelwidth;
template <unsigned int blockSize>
__global__ void FindBJ(float *d_F, float* d_y,float* d_alpha,float* d_KernelCol,float *g_odata,int* g_index,float BIValue, unsigned int n)
{
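// Working-set selection for j (second-order): each thread scans a grid-strided
// range, keeping the best gain (BIValue + y_j*F_j)^2 / max(2 - 2*K_ij, taumin)
// over indices whose alpha_j can still move down (alpha_j > 0 for y_j = 1,
// alpha_j < C for y_j = -1); value and index are then reduced per block.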
__shared__ float sdata[blockSize];
__shared__ int ind[blockSize];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid]=-FLT_MAX;
ind[tid]=0;
float temp;
float globaltemp;
float LocalCloseY;
float LocalFarY;
float maxtemp;
float denomclose;
float denomfar=1.f;
while (i < n)
{
LocalCloseY=d_y[i];
LocalFarY=(i+blockSize)<n ? d_y[i+blockSize]:0.f;
denomclose=(2.f-2.f*d_KernelCol[i]);
if(i+blockSize<n){denomfar=(2.f-2.f*d_KernelCol[i+blockSize]);}
denomclose=denomclose<taumin?taumin:denomclose;
denomfar=denomfar<taumin?taumin:denomfar;
maxtemp=
fmaxf(
globaltemp=
(LocalCloseY*d_alpha[i])>(LocalCloseY==1?0:-C) ?
__fdividef(__powf(BIValue+LocalCloseY*d_F[i],2.f),denomclose)
:-FLT_MAX,
i+blockSize<n ?
((LocalFarY*d_alpha[i+blockSize])>(LocalFarY==1?0:-C)?
__fdividef(__powf(BIValue+LocalFarY*d_F[i+blockSize],2.f),denomfar)
:-FLT_MAX)
:-FLT_MAX);
sdata[tid]=fmaxf(temp=sdata[tid],maxtemp);
if (sdata[tid]!=temp)
{
ind[tid] = sdata[tid]==globaltemp ? i : i+blockSize;
}
i += gridSize;
}
__syncthreads();
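// Block-wide max reduction of (value, index) pairs in shared memory; the
// tid < 32 tail relies on implicit warp-synchronous execution, as was common
// in reduction kernels of this era.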
if (tid < 128){ if (sdata[tid] < sdata[tid + 128]){ ind[tid]=ind[tid+128];sdata[tid]=sdata[tid+128]; }} __syncthreads();
if (tid < 64){ if (sdata[tid] < sdata[tid + 64]){ ind[tid]=ind[tid+64];sdata[tid]=sdata[tid+64]; }} __syncthreads();
if (tid < 32)
{
if (sdata[tid] <sdata[tid + 32]) {ind[tid]=ind[tid+32];sdata[tid]=sdata[tid+32];}
if (sdata[tid] <sdata[tid + 16]) {ind[tid]=ind[tid+16];sdata[tid]=sdata[tid+16];}
if (sdata[tid] <sdata[tid + 8]) {ind[tid]=ind[tid+8];sdata[tid]=sdata[tid+8];}
if (sdata[tid] <sdata[tid + 4]) {ind[tid]=ind[tid+4];sdata[tid]=sdata[tid+4];}
if (sdata[tid] <sdata[tid + 2]) {ind[tid]=ind[tid+2];sdata[tid]=sdata[tid+2];}
if (sdata[tid] <sdata[tid + 1]) {ind[tid]=ind[tid+1];sdata[tid]=sdata[tid+1];}
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
if (tid == 0) g_index[blockIdx.x] = ind[0];
}
template <unsigned int blockSize>
__global__ void FindBI(float *d_F, float* d_y,float* d_alpha,float *g_odata,int* g_index,unsigned int n)
{
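// Working-set selection for i (first-order): maximize -y_i*F_i over indices
// whose alpha_i can still move up (alpha_i < C for y_i = 1, alpha_i > 0 for
// y_i = -1), reducing value and index per block.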
__shared__ float sdata[blockSize];
__shared__ int ind[blockSize];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid]=-FLT_MAX;
ind[tid]=0;
float temp;
float globaltemp;
float LocalCloseY;
float LocalFarY;
float maxtemp;
while (i < n)
{
LocalCloseY=d_y[i];
LocalFarY=(i+blockSize)<n ? d_y[i+blockSize]:0;
maxtemp=
fmaxf(
globaltemp=
(LocalCloseY*d_alpha[i])<(LocalCloseY==1?C:0) ?
-(d_F[i]*LocalCloseY)
:-FLT_MAX,
i+blockSize<n ?
((LocalFarY*d_alpha[i+blockSize])<(LocalFarY==1?C:0) ?
-(d_F[i+blockSize]*LocalFarY)
:-FLT_MAX)
:-FLT_MAX);
sdata[tid]=fmaxf(temp=sdata[tid],maxtemp);
if (sdata[tid]!=temp)
{
ind[tid] = sdata[tid]==globaltemp ? i : i+blockSize;
}
i += gridSize;
}
__syncthreads();
if (tid < 128){ if (sdata[tid] < sdata[tid + 128]){ ind[tid]=ind[tid+128];sdata[tid]=sdata[tid+128]; }} __syncthreads();
if (tid < 64){ if (sdata[tid] < sdata[tid + 64]){ ind[tid]=ind[tid+64];sdata[tid]=sdata[tid+64]; }} __syncthreads();
if (tid < 32)
{
if (sdata[tid] <sdata[tid + 32]) {ind[tid]=ind[tid+32];sdata[tid]=sdata[tid+32];}
if (sdata[tid] <sdata[tid + 16]) {ind[tid]=ind[tid+16];sdata[tid]=sdata[tid+16];}
if (sdata[tid] <sdata[tid + 8]) {ind[tid]=ind[tid+8];sdata[tid]=sdata[tid+8];}
if (sdata[tid] <sdata[tid + 4]) {ind[tid]=ind[tid+4];sdata[tid]=sdata[tid+4];}
if (sdata[tid] <sdata[tid + 2]) {ind[tid]=ind[tid+2];sdata[tid]=sdata[tid+2];}
if (sdata[tid] <sdata[tid + 1]) {ind[tid]=ind[tid+1];sdata[tid]=sdata[tid+1];}
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
if (tid == 0) g_index[blockIdx.x] = ind[0];
}
template <unsigned int blockSize>
__global__ void FindStoppingJ(float *d_F, float* d_y,float* d_alpha,float *g_odata,unsigned int n)
{
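// Computes the minimum of -y*F over the "down" index set; the host compares
// BIValue - SJValue against StoppingCrit as the duality-gap stopping test.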
__shared__ float sdata[blockSize];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + tid;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid]=FLT_MAX;
float LocalCloseY;
float LocalFarY;
while (i < n)
{
LocalCloseY=d_y[i];
LocalFarY=(i+blockSize)<n ? d_y[i+blockSize]:0;
sdata[tid]=
fminf(
sdata[tid],
fminf(
(LocalCloseY*d_alpha[i])>(LocalCloseY==1?0:-C) ?
-(d_F[i]*LocalCloseY)
:FLT_MAX,
i+blockSize<n ?
((LocalFarY*d_alpha[i+blockSize])>(LocalFarY==1?0:-C)?
-(d_F[i+blockSize]*LocalFarY)
:FLT_MAX)
:FLT_MAX));
i += gridSize;
}
__syncthreads();
if (tid < 128){ sdata[tid]=fminf(sdata[tid],sdata[tid+128]);} __syncthreads();
if (tid < 64){ sdata[tid]=fminf(sdata[tid],sdata[tid+64]);} __syncthreads();
if (tid < 32) {
sdata[tid]=fminf(sdata[tid],sdata[tid+32]);
sdata[tid]=fminf(sdata[tid],sdata[tid+16]);
sdata[tid]=fminf(sdata[tid],sdata[tid+8]);
sdata[tid]=fminf(sdata[tid],sdata[tid+4]);
sdata[tid]=fminf(sdata[tid],sdata[tid+2]);
sdata[tid]=fminf(sdata[tid],sdata[tid+1]);
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void UpdateF(float * F,float *KernelColI,float* KernelColJ, float* d_y,float deltaalphai,float deltaalphaj,float yi,float yj,int n)
{
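// Grid-stride rank-two gradient update:
// F_k += y_i*y_k*deltaalphai*K_ik + y_j*y_k*deltaalphaj*K_jk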
int totalThreads,ctaStart,tid;
totalThreads = gridDim.x*blockDim.x;
ctaStart = blockDim.x*blockIdx.x;
tid = threadIdx.x;
int i;
for (i = ctaStart + tid; i < n; i += totalThreads)
{
F[i] = F[i] + yi*d_y[i]*deltaalphai*KernelColI[i]+yj*d_y[i]*deltaalphaj*KernelColJ[i];
}
}
__global__ void RBFFinish(float *KernelCol, const float * KernelDotProd,const float* DotProd,const float* DotProdRow,const int n)
{
int totalThreads,ctaStart,tid;
totalThreads = gridDim.x*blockDim.x;
ctaStart = blockDim.x*blockIdx.x;
tid = threadIdx.x;
int i;
for (i = ctaStart + tid; i < n; i += totalThreads)
{
KernelCol[i] = expf(kernelwidth*(DotProd[i]+*DotProdRow-KernelDotProd[i]*2.f));
}
}
inline void RBFKernel(float *d_KernelJ,const int BJIndex,const float *d_x,const float * d_Kernel_InterRow,float *d_KernelDotProd, float *d_SelfDotProd,const int& m,const int& n,const int &nbrCtas,const int& threadsPerCta)
{
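// One RBF kernel column: SGEMV forms the dot products x_i.x_b for all i, then
// RBFFinish applies expf(kernelwidth*(||x_i||^2 + ||x_b||^2 - 2*x_i.x_b));
// kernelwidth was negated on the host, so this is exp(-gamma*||x_i - x_b||^2).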
cublasSgemv ('n', m, n, 1,d_x, m, d_Kernel_InterRow, 1, 0, d_KernelDotProd, 1);
RBFFinish<<<nbrCtas,threadsPerCta>>>(d_KernelJ, d_KernelDotProd,d_SelfDotProd,d_SelfDotProd+BJIndex,m);
}
inline void CpuMaxInd(float &BIValue, int &BIIndex,const float * value_inter,const int * index_inter,const int n)
{
BIValue=value_inter[0];
BIIndex=index_inter[0];
for(int j=0;j<n;j++)
{
if (value_inter[j]>BIValue)
{
BIValue=value_inter[j];
BIIndex=index_inter[j];
}
}
}
inline void CpuMaxIndSvr(float &BIValue, int &BIIndex, const float * value_inter,const int * index_inter,int n,const int m)
{
BIValue=value_inter[0];
BIIndex=index_inter[0];
for(int j=0;j<n;j++)
{
if (value_inter[j]>BIValue)
{
BIValue=value_inter[j];
BIIndex=j<n/2?index_inter[j]:index_inter[j]+m;
}
}
}
inline void CpuMin(float &SJValue, float * value_inter,int n)
{
SJValue=value_inter[0];
for(int j=0;j<n;j++)
{
if (value_inter[j]<SJValue)
{
SJValue=value_inter[j];
}
}
}
inline void DotProdVector(float * x, float* dotprod,int m, int n)
{
for(int i=0;i<m;i++)
{
dotprod[i]=0;
for(int j=0;j<n;j++)
dotprod[i]+=(x[i+j*m])*(x[i+j*m]);
}
}
inline void IncrementKernelCache(vector<int>& KernelCacheItersSinceUsed,const int &RowsInKernelCache)
{
for(int k=0;k<RowsInKernelCache;k++)
{
KernelCacheItersSinceUsed[k]+=1;
}
}
inline void UpdateAlphas(float& alphai,float& alphaj,const float& Kij,const float& yi,const float& yj,const float& Fi,const float& Fj,const float& C,const float& h_taumin)
{
//This alpha update code is adapted from that in LIBSVM.
//Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
float lambda;
float lambda_denom;
lambda_denom=2.0-2.0*Kij;
if (lambda_denom<h_taumin) {lambda_denom=h_taumin;}
if (yi!=yj)
{
lambda=(-Fi-Fj)/lambda_denom;
float alphadiff=alphai-alphaj;
alphai+=lambda;
alphaj+=lambda;
if(alphadiff > 0)
{
if(alphaj < 0)
{
alphaj = 0;
alphai = alphadiff;
}
}
else
{
if(alphai < 0)
{
alphai = 0;
alphaj = -alphadiff;
}
}
if(alphadiff > 0)
{
if(alphai > C)
{
alphai = C;
alphaj = C - alphadiff;
}
}
else
{
if(alphaj > C)
{
alphaj = C;
alphai = C + alphadiff;
}
}
}
else
{
float alphasum=alphai+alphaj;
lambda=(Fi-Fj)/lambda_denom;
alphai-=lambda;
alphaj+=lambda;
if(alphasum > C)
{
if(alphai > C)
{
alphai = C;
alphaj = alphasum - C;
}
if(alphaj > C)
{
alphaj = C;
alphai = alphasum - C;
}
}
else
{
if(alphaj < 0)
{
alphaj = 0;
alphai = alphasum;
}
if(alphai < 0)
{
alphai = 0;
alphaj = alphasum;
}
}
}
}
extern "C"
void SVRTrain(float *mexalpha,float* beta,float*y,float *x ,float C, float kernelwidth, float eps, int m, int n, float StoppingCrit)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL);
float * elapsed=(float *)mxGetData(mexelapsed);
cudaEventRecord(start,0);
cublasInit();
int numBlocks=64;
dim3 ReduceGrid(numBlocks, 1, 1);
dim3 ReduceBlock(256, 1, 1);
float h_taumin=0.0001;
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("taumin", &h_taumin, sizeof(float)));
kernelwidth*=-1;
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("kernelwidth", &kernelwidth, sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("C", &C, sizeof(float)));
float *alphasvr=new float [2*m];
float *ybinary=new float [2*m];
float *F=new float [2*m];
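// epsilon-SVR is posed as a 2m-variable problem: entries 0..m-1 are the
// "plus" variables (y = +1, F = -y + eps), entries m..2m-1 the "minus"
// variables (y = -1, F = y + eps).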
for(int j=0;j<m;j++)
{
alphasvr[j]=0;
ybinary[j]=1;
F[j]=-y[j]+eps;
alphasvr[j+m]=0;
ybinary[j+m]=-1;
F[j+m]=y[j]+eps;
}
float *SelfDotProd=new float [m];
DotProdVector(x, SelfDotProd,m, n);
int nbrCtas;
int elemsPerCta;
int threadsPerCta;
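// VectorSplay (from cuSVMutil.h) picks the grid/block launch configuration
// used by the elementwise kernels (RBFFinish, UpdateF) below.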
VectorSplay (m, SAXPY_THREAD_MIN, SAXPY_THREAD_MAX, SAXPY_CTAS_MAX, &nbrCtas, &elemsPerCta,&threadsPerCta);
float * d_x;
float * d_xT;
float * d_alpha;
float* d_y;
float* d_F;
float *d_KernelDotProd;
float *d_SelfDotProd;
float *d_KernelJ;
float *d_KernelI;
float* d_KernelInterRow;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_x, m*n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_xT, m*n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpy(d_x, x, sizeof(float)*n*m,cudaMemcpyHostToDevice));
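// Transpose the (column-major, m x n) data once so each training example is
// contiguous in xT; single rows can then be copied to d_KernelInterRow cheaply.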
dim3 gridtranspose(ceil((float)m / TRANS_BLOCK_DIM), ceil((float)n / TRANS_BLOCK_DIM), 1);
dim3 threadstranspose(TRANS_BLOCK_DIM, TRANS_BLOCK_DIM, 1);
cudaThreadSynchronize();
transpose<<< gridtranspose, threadstranspose >>>(d_xT, d_x, m, n);
float *xT=new float [n*m];
mxCUDA_SAFE_CALL(cudaMemcpy(xT, d_xT, sizeof(float)*m*n,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaFree(d_xT));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_KernelInterRow, n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_alpha, 2*m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_y, 2*m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_F, 2*m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_SelfDotProd, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_KernelDotProd, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpy(d_y, ybinary, sizeof(float)*m*2,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha, alphasvr, sizeof(float)*m*2,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_F, F, sizeof(float)*m*2,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_SelfDotProd, SelfDotProd, sizeof(float)*m,cudaMemcpyHostToDevice));
delete [] F;
delete [] SelfDotProd;
float* value_inter;
int* index_inter;
float* value_inter_svr;
int* index_inter_svr;
cudaMallocHost( (void**)&value_inter, numBlocks*sizeof(float) );
cudaMallocHost( (void**)&index_inter, numBlocks*sizeof(int) );
cudaMallocHost( (void**)&value_inter_svr, 2*numBlocks*sizeof(float) );
cudaMallocHost( (void**)&index_inter_svr, 2*numBlocks*sizeof(int) );
float* d_value_inter;
int* d_index_inter;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_value_inter, numBlocks*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_index_inter, numBlocks*sizeof(int)));
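// NB: CUDA 3.2's driver API declares cuMemGetInfo with size_t* arguments;
// unsigned int matches older toolkits and may be exactly the sort of SDK
// change that breaks a 64-bit build of this code.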
unsigned int free, total;
cuMemGetInfo(&free, &total);
int KernelCacheSize=free-MBtoLeave*1024*1024;
int RowsInKernelCache=KernelCacheSize/(sizeof(float)*m);
float *d_Kernel_Cache;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_Kernel_Cache, KernelCacheSize));
vector<int> KernelCacheIndices(RowsInKernelCache,-1);
vector<int> KernelCacheItersSinceUsed(RowsInKernelCache,0);
vector<int>::iterator CachePosI;
vector<int>::iterator CachePosJ;
int CacheDiffI;
int CacheDiffJ;
int CheckStoppingCritEvery=255;
int iter=0;
float BIValue;
int BIIndex;
float SJValue;
float BJSecondOrderValue;
int BJIndex;
float Kij;
float yj;
float yi;
float alphai;
float alphaj;
float oldalphai;
float oldalphaj;
float Fi;
float Fj;
while (1)
{
FindBI<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_value_inter,d_index_inter, 2*m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(index_inter, d_index_inter, sizeof(int)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMaxInd(BIValue,BIIndex,value_inter,index_inter,numBlocks);
if ((iter & CheckStoppingCritEvery)==0)
{
FindStoppingJ<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_value_inter, 2*m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMin(SJValue,value_inter,numBlocks);
if(BIValue-SJValue<StoppingCrit) {*beta=(SJValue+BIValue)/2; break;}
}
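// LRU kernel-row cache: look up column i (SVR indices are taken mod m); on a
// miss, the least-recently-used cache row is overwritten and recomputed.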
CachePosI=find(KernelCacheIndices.begin(),KernelCacheIndices.end(),(BIIndex>=m?BIIndex-m:BIIndex));
if (CachePosI ==KernelCacheIndices.end())
{
CacheDiffI=max_element(KernelCacheItersSinceUsed.begin(),KernelCacheItersSinceUsed.end())-KernelCacheItersSinceUsed.begin();
d_KernelI=d_Kernel_Cache+CacheDiffI*m;
mxCUDA_SAFE_CALL(cudaMemcpy(d_KernelInterRow, xT+(BIIndex>=m?BIIndex-m:BIIndex)*n, n*sizeof(float),cudaMemcpyHostToDevice));
RBFKernel(d_KernelI,(BIIndex>=m?BIIndex-m:BIIndex),d_x,d_KernelInterRow,d_KernelDotProd,d_SelfDotProd,m,n,nbrCtas,threadsPerCta);
*(KernelCacheIndices.begin()+CacheDiffI)=(BIIndex>=m?BIIndex-m:BIIndex);
}
else
{
CacheDiffI=CachePosI-KernelCacheIndices.begin();
d_KernelI=d_Kernel_Cache+m*CacheDiffI;
}
*(KernelCacheItersSinceUsed.begin()+CacheDiffI)=-1;
FindBJ<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_KernelI,d_value_inter,d_index_inter,BIValue, m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter_svr, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(index_inter_svr, d_index_inter, sizeof(int)*numBlocks,cudaMemcpyDeviceToHost));
FindBJ<256><<<ReduceGrid, ReduceBlock>>>(d_F+m, d_y+m,d_alpha+m,d_KernelI,d_value_inter,d_index_inter,BIValue,m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter_svr+numBlocks, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(index_inter_svr+numBlocks, d_index_inter, sizeof(int)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMaxIndSvr(BJSecondOrderValue,BJIndex,value_inter_svr,index_inter_svr,2*numBlocks,m);
mxCUDA_SAFE_CALL(cudaMemcpy(&Kij, d_KernelI+(BJIndex>=m?BJIndex-m:BJIndex), sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&alphai, d_alpha+BIIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&alphaj, d_alpha+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&yi, d_y+BIIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&yj, d_y+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&Fi, d_F+BIIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&Fj, d_F+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
oldalphai=alphai;
oldalphaj=alphaj;
UpdateAlphas(alphai,alphaj,Kij,yi,yj,Fi,Fj,C,h_taumin);
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha+BIIndex, &alphai, sizeof(float),cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha+BJIndex, &alphaj, sizeof(float),cudaMemcpyHostToDevice));
float deltaalphai = alphai - oldalphai;
float deltaalphaj = alphaj - oldalphaj;
CachePosJ=find(KernelCacheIndices.begin(),KernelCacheIndices.end(),(BJIndex>=m?BJIndex-m:BJIndex));
if (CachePosJ ==KernelCacheIndices.end())
{
CacheDiffJ=max_element(KernelCacheItersSinceUsed.begin(),KernelCacheItersSinceUsed.end())-KernelCacheItersSinceUsed.begin();
d_KernelJ=d_Kernel_Cache+CacheDiffJ*m;
mxCUDA_SAFE_CALL(cudaMemcpy(d_KernelInterRow, xT+(BJIndex>=m?BJIndex-m:BJIndex)*n, n*sizeof(float),cudaMemcpyHostToDevice));
RBFKernel(d_KernelJ,(BJIndex>=m?BJIndex-m:BJIndex),d_x,d_KernelInterRow,d_KernelDotProd,d_SelfDotProd,m,n,nbrCtas,threadsPerCta);
*(KernelCacheIndices.begin()+CacheDiffJ)=(BJIndex>=m?BJIndex-m:BJIndex);
}
else
{
CacheDiffJ=CachePosJ-KernelCacheIndices.begin();
d_KernelJ=d_Kernel_Cache+m*CacheDiffJ;
}
UpdateF<<<nbrCtas,threadsPerCta>>>(d_F,d_KernelI,d_KernelJ,d_y,deltaalphai,deltaalphaj,yi,yj,m);
UpdateF<<<nbrCtas,threadsPerCta>>>(d_F+m,d_KernelI,d_KernelJ,d_y+m,deltaalphai,deltaalphaj,yi,yj,m);
IncrementKernelCache(KernelCacheItersSinceUsed,RowsInKernelCache);
*(KernelCacheItersSinceUsed.begin()+CacheDiffI)=0;
*(KernelCacheItersSinceUsed.begin()+CacheDiffJ)=0;
iter++;
}
cublasGetVector(m*2,sizeof(float),d_alpha,1,alphasvr,1);
for(int k=0;k<m;k++)
{
mexalpha[k]=(alphasvr[k]-alphasvr[k+m])*ybinary[k];
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(elapsed, start, stop);
mexPutVariable("base","cuSVMTrainTimeInMS",mexelapsed);
delete [] ybinary;
delete [] alphasvr;
delete [] xT;
cudaFreeHost(value_inter_svr);
cudaFreeHost(index_inter_svr);
cudaFreeHost(value_inter);
cudaFreeHost(index_inter);
mxCUDA_SAFE_CALL(cudaFree(d_x));
mxCUDA_SAFE_CALL(cudaFree(d_y));
mxCUDA_SAFE_CALL(cudaFree(d_alpha));
mxCUDA_SAFE_CALL(cudaFree(d_Kernel_Cache));
mxCUDA_SAFE_CALL(cudaFree(d_KernelInterRow));
mxCUDA_SAFE_CALL(cudaFree(d_F));
mxCUDA_SAFE_CALL(cudaFree(d_value_inter));
mxCUDA_SAFE_CALL(cudaFree(d_index_inter));
mxCUDA_SAFE_CALL(cudaFree(d_SelfDotProd));
mxCUDA_SAFE_CALL(cudaFree(d_KernelDotProd));
mxCUDA_SAFE_CALL( cudaThreadExit());
return;
}
extern "C"
void SVMTrain(float *mexalpha,float* beta,float*y,float *x ,float C, float kernelwidth, int m, int n, float StoppingCrit)
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL);
float * elapsed=(float *)mxGetData(mexelapsed);
cudaEventRecord(start,0);
int numBlocks=64;
dim3 ReduceGrid(numBlocks, 1, 1);
dim3 ReduceBlock(256, 1, 1);
float h_taumin=0.0001;
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("taumin", &h_taumin, sizeof(float)));
kernelwidth*=-1;
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("kernelwidth", &kernelwidth, sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpyToSymbol("C", &C, sizeof(float)));
float *h_alpha=new float [m];
float *h_F=new float [m];
for(int j=0;j<m;j++)
{
h_alpha[j]=0;
h_F[j]=-1;
}
float *SelfDotProd=new float [m];
DotProdVector(x, SelfDotProd,m, n);
int nbrCtas;
int elemsPerCta;
int threadsPerCta;
VectorSplay (m, SAXPY_THREAD_MIN, SAXPY_THREAD_MAX, SAXPY_CTAS_MAX, &nbrCtas, &elemsPerCta,&threadsPerCta);
float * d_x;
float * d_xT;
float * d_alpha;
float* d_y;
float* d_F;
float *d_KernelDotProd;
float *d_SelfDotProd;
float *d_KernelJ;
float *d_KernelI;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_x, m*n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_xT, m*n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpy(d_x, x, sizeof(float)*n*m,cudaMemcpyHostToDevice));
dim3 gridtranspose(ceil((float)m / TRANS_BLOCK_DIM), ceil((float)n / TRANS_BLOCK_DIM), 1);
dim3 threadstranspose(TRANS_BLOCK_DIM, TRANS_BLOCK_DIM, 1);
cudaThreadSynchronize();
transpose<<< gridtranspose, threadstranspose >>>(d_xT, d_x, m, n);
float *xT=new float [n*m];
mxCUDA_SAFE_CALL(cudaMemcpy(xT, d_xT, sizeof(float)*m*n,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaFree(d_xT));
float* d_KernelInterRow;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_KernelInterRow, n*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_alpha, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_y, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_F, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_SelfDotProd, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_KernelDotProd, m*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMemcpy(d_y, y, sizeof(float)*m,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha, h_alpha, sizeof(float)*m,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_F, h_F, sizeof(float)*m,cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_SelfDotProd, SelfDotProd, sizeof(float)*m,cudaMemcpyHostToDevice));
delete [] SelfDotProd;
float* value_inter;
int* index_inter;
cudaMallocHost( (void**)&value_inter, numBlocks*sizeof(float) );
cudaMallocHost( (void**)&index_inter, numBlocks*sizeof(int) );
float* d_value_inter;
int* d_index_inter;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_value_inter, numBlocks*sizeof(float)));
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_index_inter, numBlocks*sizeof(int)));
unsigned int free, total;
cuMemGetInfo(&free, &total);
int KernelCacheSize=free-MBtoLeave*1024*1024;
int RowsInKernelCache=KernelCacheSize/(sizeof(float)*m);
float *d_Kernel_Cache;
mxCUDA_SAFE_CALL(cudaMalloc( (void**) &d_Kernel_Cache, KernelCacheSize));
vector<int> KernelCacheIndices(RowsInKernelCache,-1);
vector<int> KernelCacheItersSinceUsed(RowsInKernelCache,0);
vector<int>::iterator CachePosI;
vector<int>::iterator CachePosJ;
int CacheDiffI;
int CacheDiffJ;
int CheckStoppingCritEvery=255;
int iter=0;
float BIValue;
int BIIndex;
float SJValue;
float BJSecondOrderValue;
int BJIndex;
float Kij;
float yj;
float yi;
float alphai;
float alphaj;
float oldalphai;
float oldalphaj;
float Fi;
float Fj;
while (1)
{
FindBI<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_value_inter,d_index_inter, m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(index_inter, d_index_inter, sizeof(int)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMaxInd(BIValue,BIIndex,value_inter,index_inter,numBlocks);
cudaMemcpy(&Fi, d_F+BIIndex, sizeof(float),cudaMemcpyDeviceToHost);
if ((iter & CheckStoppingCritEvery)==0)
{
FindStoppingJ<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_value_inter, m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMin(SJValue,value_inter,numBlocks);
if(BIValue-SJValue<StoppingCrit) {*beta=(SJValue+BIValue)/2; break;}
}
CachePosI=find(KernelCacheIndices.begin(),KernelCacheIndices.end(),BIIndex);
if (CachePosI ==KernelCacheIndices.end())
{
CacheDiffI=max_element(KernelCacheItersSinceUsed.begin(),KernelCacheItersSinceUsed.end())-KernelCacheItersSinceUsed.begin();
d_KernelI=d_Kernel_Cache+CacheDiffI*m;
mxCUDA_SAFE_CALL(cudaMemcpy(d_KernelInterRow, xT+BIIndex*n, n*sizeof(float),cudaMemcpyHostToDevice));
RBFKernel(d_KernelI,BIIndex,d_x,d_KernelInterRow,d_KernelDotProd,d_SelfDotProd,m,n,nbrCtas,threadsPerCta);
*(KernelCacheIndices.begin()+CacheDiffI)=BIIndex;
}
else
{
CacheDiffI=CachePosI-KernelCacheIndices.begin();
d_KernelI=d_Kernel_Cache+m*CacheDiffI;
}
*(KernelCacheItersSinceUsed.begin()+CacheDiffI)=-1;
FindBJ<256><<<ReduceGrid, ReduceBlock>>>(d_F, d_y,d_alpha,d_KernelI,d_value_inter,d_index_inter,BIValue, m);
mxCUDA_SAFE_CALL(cudaMemcpy(value_inter, d_value_inter, sizeof(float)*numBlocks,cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(index_inter, d_index_inter, sizeof(int)*numBlocks,cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
CpuMaxInd(BJSecondOrderValue,BJIndex,value_inter,index_inter,numBlocks);
mxCUDA_SAFE_CALL(cudaMemcpy(&Kij, d_KernelI+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&alphai, d_alpha+BIIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&alphaj, d_alpha+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&yi, d_y+BIIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&yj, d_y+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
mxCUDA_SAFE_CALL(cudaMemcpy(&Fj, d_F+BJIndex, sizeof(float),cudaMemcpyDeviceToHost));
oldalphai=alphai;
oldalphaj=alphaj;
UpdateAlphas(alphai,alphaj,Kij,yi,yj,Fi,Fj,C,h_taumin);
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha+BIIndex, &alphai, sizeof(float),cudaMemcpyHostToDevice));
mxCUDA_SAFE_CALL(cudaMemcpy(d_alpha+BJIndex, &alphaj, sizeof(float),cudaMemcpyHostToDevice));
float deltaalphai = alphai - oldalphai;
float deltaalphaj = alphaj - oldalphaj;
CachePosJ=find(KernelCacheIndices.begin(),KernelCacheIndices.end(),BJIndex);
if (CachePosJ ==KernelCacheIndices.end())
{
CacheDiffJ=max_element(KernelCacheItersSinceUsed.begin(),KernelCacheItersSinceUsed.end())-KernelCacheItersSinceUsed.begin();
d_KernelJ=d_Kernel_Cache+CacheDiffJ*m;
mxCUDA_SAFE_CALL(cudaMemcpy(d_KernelInterRow, xT+BJIndex*n, n*sizeof(float),cudaMemcpyHostToDevice));
RBFKernel(d_KernelJ,BJIndex,d_x,d_KernelInterRow,d_KernelDotProd,d_SelfDotProd,m,n,nbrCtas,threadsPerCta);
*(KernelCacheIndices.begin()+CacheDiffJ)=BJIndex;
}
else
{
CacheDiffJ=CachePosJ-KernelCacheIndices.begin();
d_KernelJ=d_Kernel_Cache+m*CacheDiffJ;
}
UpdateF<<<nbrCtas,threadsPerCta>>>(d_F,d_KernelI,d_KernelJ,d_y,deltaalphai,deltaalphaj,yi,yj,m);
IncrementKernelCache(KernelCacheItersSinceUsed,RowsInKernelCache);
*(KernelCacheItersSinceUsed.begin()+CacheDiffI)=0;
*(KernelCacheItersSinceUsed.begin()+CacheDiffJ)=0;
iter++;
}
cublasGetVector(m,sizeof(float),d_alpha,1,mexalpha,1);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(elapsed, start, stop);
mexPutVariable("base","cuSVMTrainTimeInMS",mexelapsed);
delete [] xT;
cudaFreeHost(value_inter);
cudaFreeHost(index_inter);
mxCUDA_SAFE_CALL(cudaFree(d_x));
mxCUDA_SAFE_CALL(cudaFree(d_y));
mxCUDA_SAFE_CALL(cudaFree(d_alpha));
mxCUDA_SAFE_CALL(cudaFree(d_KernelInterRow));
mxCUDA_SAFE_CALL(cudaFree(d_Kernel_Cache));
mxCUDA_SAFE_CALL(cudaFree(d_F));
mxCUDA_SAFE_CALL(cudaFree(d_value_inter));
mxCUDA_SAFE_CALL(cudaFree(d_index_inter));
mxCUDA_SAFE_CALL(cudaFree(d_SelfDotProd));
mxCUDA_SAFE_CALL(cudaFree(d_KernelDotProd));
mxCUDA_SAFE_CALL(cudaThreadExit());
return;
}
I am now able to build and run the 64-bit CUDA examples that NVIDIA supplies. I suspect the problem lies in a change in the CUDA SDK from when the code was originally written...