If you are going to ‘process’ according to your CPU function:
dst[y][x] = src[x][y] + src[y][x] + src[x][x] + src[y][y];
this requires width == height: with a non-square 2-D array of dimensions, say, 2x3, there is no valid [3][3] (or [2][2] with 0-indexing) location, yet the code would still try to read from that address, which is an error.
Here is a quick, easy, non-optimal way of accomplishing the same thing:
#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;
// Threads per block used when main launches the process kernel.
#define THREADS 256
// Matrix width in elements.
#define WIDTH 4096
// Matrix height; tied to WIDTH so the matrix is square.
#define HEIGHT WIDTH
// Raises the multimedia timer resolution; returns false if the timer
// capabilities cannot be queried. wTimerRes is taken by value, so the
// caller's variable is not modified.
bool InitMMTimer(UINT wTimerRes);
// Ends the timer period when init is true; does nothing otherwise.
void DestroyMMTimer(UINT wTimerRes, bool init);
// Fills A[0..size) with non-negative pseudo-random floats drawn from rand().
void _generate_random_values_float(float *A, const int size);
// Computes, for a square row-major N x N matrix,
//   dst[y][x] = src[x][y] + src[y][x] + src[x][x] + src[y][y]
// with one output element per thread.
//
// Launch layout: 1-D grid of 1-D blocks supplying at least N*N threads;
// threads whose flat index is >= N*N do nothing.
//   src - device pointer to N*N input floats
//   dst - device pointer to N*N output floats
//   N   - side length of the matrix; N <= 0 makes the kernel a no-op
__global__ void process(const float *src, float *dst, const int N){
    // Guard first: the row/column split below divides by N.
    if(N<=0){return;}
    // 64-bit indices so neither blockIdx.x*blockDim.x nor x*N+y can wrap
    // once N*N no longer fits in a 32-bit int.
    const long long n=N;
    const long long offset=(long long)blockIdx.x*blockDim.x+threadIdx.x;
    if(offset>=n*n){return;}
    const long long y=offset/n;
    const long long x=offset%n;
    dst[offset]=src[x*n+y]+src[offset]+src[x*n+x]+src[y*n+y];
}
// CPU reference for the process kernel. For each (row, col) of a row-major
// N x N matrix it stores
//   src[col][row] + src[row][col] + src[col][col] + src[row][row]
// into dst[row][col].
//   src - N*N input floats
//   dst - N*N output floats
//   N   - side length of the matrix
void calc(const float *src, float *dst, const int N){
    for(int row=0;row<N;++row){
        const int rowStart=row*N;
        const int rowDiag=rowStart+row;
        for(int col=0;col<N;++col){
            const int colStart=col*N;
            const int here=rowStart+col;
            const float mirrored=src[colStart+row];
            const float own=src[here];
            const float colDiagonal=src[colStart+col];
            const float rowDiagonal=src[rowDiag];
            // Summed strictly left to right so the rounding is unchanged.
            dst[here]=((mirrored+own)+colDiagonal)+rowDiagonal;
        }
    }
}
// Reports a failed CUDA call in this file's usual message format.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *file, int line){
    if(err==cudaSuccess){return true;}
    printf("%s in %s at line %d\n",cudaGetErrorString(err),file,line);
    return false;
}

// Benchmark driver: fills a HEIGHT x WIDTH matrix with random floats, runs the
// process kernel on the GPU (timing upload + kernel + download), runs calc on
// the CPU, then reports the largest absolute difference between both results.
// Returns 0 on success and 1 if a host allocation or any CUDA call failed.
int main(){
    char ch;
    srand((unsigned int)time(NULL));
    const int NN=HEIGHT*WIDTH;
    const int lda=(HEIGHT>WIDTH)?HEIGHT:WIDTH;
    const size_t numbytes=size_t(NN)*sizeof(float);
    float *SRC=(float *)malloc(numbytes);
    float *H_RESULT=(float *)malloc(numbytes);
    float *D_RESULT=(float *)malloc(numbytes);
    float *D_SRC=NULL,*D_DST=NULL;
    const bool hostOk=(SRC!=NULL && H_RESULT!=NULL && D_RESULT!=NULL);
    bool gpuOk=false;
    if(!hostOk){
        printf("host allocation failed in %s at line %d\n",__FILE__,__LINE__);
    }else{
        _generate_random_values_float(SRC,NN);
        // Each step runs only if every earlier CUDA call succeeded, so a
        // failure is reported once instead of cascading through bad buffers.
        gpuOk=cudaOk(cudaMalloc((void **)&D_SRC,numbytes),__FILE__,__LINE__);
        gpuOk=gpuOk && cudaOk(cudaMalloc((void **)&D_DST,numbytes),__FILE__,__LINE__);

        UINT wTimerRes = 0;
        bool init = InitMMTimer(wTimerRes);
        DWORD startTime=0,GPUtime=0,CPUtime=0;
        startTime = timeGetTime();
        // copy array values from host into device memory
        gpuOk=gpuOk && cudaOk(cudaMemcpy(D_SRC,SRC,numbytes,cudaMemcpyHostToDevice),__FILE__,__LINE__);
        if(gpuOk){
            const unsigned int blocks=(NN+THREADS-1)/THREADS;
            process<<<blocks,THREADS>>>(D_SRC,D_DST,lda);
            // A launch does not return a status; an invalid configuration
            // (e.g. a grid larger than the device allows) only shows up here.
            gpuOk=cudaOk(cudaGetLastError(),__FILE__,__LINE__);
        }
        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
        gpuOk=gpuOk && cudaOk(cudaDeviceSynchronize(),__FILE__,__LINE__);
        // copy back result to host memory
        gpuOk=gpuOk && cudaOk(cudaMemcpy(D_RESULT,D_DST,numbytes,cudaMemcpyDeviceToHost),__FILE__,__LINE__);
        DWORD endTime = timeGetTime();
        GPUtime=endTime-startTime;
        if(gpuOk){
            std::cout<<"\nGPU total time(copy from host, process, copy back result): "<<float(GPUtime)/1000.0f<< "seconds.\n";
        }
        DestroyMMTimer(wTimerRes, init);

        wTimerRes = 0;
        init = InitMMTimer(wTimerRes);
        startTime = timeGetTime();
        calc(SRC,H_RESULT,lda);
        endTime = timeGetTime();
        CPUtime=endTime-startTime;
        std::cout<<"\nCPU timing: "<<float(CPUtime)/1000.0f<<" seconds.\n";
        DestroyMMTimer(wTimerRes, init);

        if(gpuOk){
            // Compare the two results and actually report the outcome.
            float max_dif=0.0f;
            for(int i=0;i<NN;i++){
                const float dif=fabs(H_RESULT[i]-D_RESULT[i]);
                if(dif>max_dif){max_dif=dif;}
            }
            std::cout<<"\nMax absolute difference between CPU and GPU results: "<<max_dif<<"\n";
        }
    }
    // cudaFree and free both accept null pointers, so cleanup is unconditional.
    cudaOk(cudaFree(D_SRC),__FILE__,__LINE__);
    cudaOk(cudaFree(D_DST),__FILE__,__LINE__);
    free(SRC);
    free(H_RESULT);
    free(D_RESULT);
    std::cin>>ch;// keep the console window open until a key is entered
    return (hostOk && gpuOk)?0:1;
}
// Resolution (ms) handed to the most recent successful timeBeginPeriod call;
// 0 while no period has been started. InitMMTimer receives its argument by
// value, so this is how DestroyMMTimer learns the matching value.
static UINT s_mmTimerRes=0;

// Requests the finest supported multimedia timer resolution, but no finer
// than 1 ms.
//   wTimerRes - ignored on input; used only as local scratch (by-value)
// Returns true if the period was started, false if the timer capabilities
// could not be queried or timeBeginPeriod rejected the request.
bool InitMMTimer(UINT wTimerRes){
    TIMECAPS tc;
    if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
    // Clamp 1 ms into [wPeriodMin, wPeriodMax].
    wTimerRes = tc.wPeriodMin;
    if (wTimerRes < 1) {wTimerRes = 1;}
    if (wTimerRes > tc.wPeriodMax) {wTimerRes = tc.wPeriodMax;}
    if (timeBeginPeriod(wTimerRes) != TIMERR_NOERROR) {return false;}
    s_mmTimerRes = wTimerRes;
    return true;
}

// Ends the timer period started by InitMMTimer.
//   wTimerRes - resolution to end; used only when no resolution was recorded
//               by InitMMTimer (callers in this file pass 0)
//   init      - value returned by InitMMTimer; nothing happens when false
void DestroyMMTimer(UINT wTimerRes, bool init){
    if (!init) {return;}
    // timeEndPeriod must receive the same value that timeBeginPeriod got.
    const UINT res = (s_mmTimerRes != 0) ? s_mmTimerRes : wTimerRes;
    timeEndPeriod(res);
}
// Fills A[0..size) with pseudo-random floats in [0, 126], each computed as
// (rand() % 127) / ((rand() % 997) + 1), then prints a confirmation line.
//   A    - destination buffer holding at least `size` floats
//   size - number of elements to fill; nothing is written when size <= 0
void _generate_random_values_float(float *A, const int size){
    for(int i=0;i<size;i++){
        // Draw the two values in separate statements: as operands of a single
        // '/' their evaluation order is unspecified, which would make the
        // sequence for a given seed depend on the compiler.
        const int numerator=std::rand()%127;
        const int denominator=(std::rand()%997)+1;
        A[i]=float(numerator)/float(denominator);
    }
    std::cout<<"\nFilled vector array with random float values\n";
}
That seems to be correct, but there are much faster ways to accomplish the same result.
Even this crude version is still faster than the CPU version by 6-10x.