Performing a computation on a 2D array in CUDA

Hi, I’m sorry if my question is too obvious. I just started learning CUDA this morning. I have this code in C, and I want to port it to CUDA to improve the execution time:

/*
 * For every x in [0, w) and y in [0, h), stores
 *   dst[x][y] = src[y][x] + src[x][y] + src[x][x] + src[y][y].
 *
 * src and dst are arrays of row pointers. Because src is indexed as both
 * [x][y] and [y][x], and on the diagonal with both x and y, every index in
 * [0, max(w, h)) must be a valid row AND column of src. In practice this
 * means the matrix has to be square (w == h); no bounds checking is done.
 */
void calc(float **src, float **dst, int w, int h)
{
    for(int x = 0; x<w; x++)
    {
        for(int y = 0; y<h; y++)
        {
            dst[x][y] = src[y][x] + src[x][y] + src[x][x] + src[y][y];
        }
    }
}

I tried to implement this code in CUDA, but I’m not sure if I’m doing the right thing:

// Kernel stub from the question; the body has not been written yet.
// NOTE(review): the first two parameters are declared without types, so this
// signature does not compile as written.
__global__ void process(hostSrc,src, int w,int h)
{
    //I'm completely clueless what should I put in here
}

// Host-side wrapper from the question: allocates two pitched device buffers,
// copies src/dst to them, launches `process`, copies the result back into dst
// and frees the device buffers. The launch configuration is left as `...`
// placeholders, so this is a sketch rather than compilable code.
// No CUDA return value is checked anywhere in this function.
void calc(float **src, float **dst, int w, int h)
{
     // Despite the "host" prefix, these are filled in by cudaMallocPitch, i.e. they are device pointers.
     float* hostSrc, *hostDst;
     // The same variable receives the pitch of both allocations; the second call overwrites the first value.
     size_t pitch;
     cudaMallocPitch(&hostSrc,&pitch, w*sizeof(float), h);
     cudaMallocPitch(&hostDst,&pitch, w*sizeof(float), h);
     // NOTE(review): cudaMemcpy2D takes the destination/source buffers themselves and the
     // pitches by value (size_t). Here &hostSrc / &src are the addresses of the pointer
     // variables and &pitch is a pointer, so these calls do not match the API.
     // NOTE(review): src and dst are float** (arrays of row pointers); cudaMemcpy2D copies
     // from one pitched block, so this presumably needs a contiguous host buffer or a
     // per-row copy - confirm how the caller allocates src/dst.
     cudaMemcpy2D(&hostSrc, &pitch, &src, w*sizeof(float), w*sizeof(float),h,cudaMemcpyHostToDevice);
     cudaMemcpy2D(&hostDst, &pitch, &dst, w*sizeof(float), w*sizeof(float),h,cudaMemcpyHostToDevice);
     dim3 grid, block;
     block.x = ...;
     grid.x = ...;
     // NOTE(review): the second argument is the host pointer `src`; hostDst is never passed
     // to the kernel, which is presumably what was intended here.
     process<<<grid,block>>>(hostSrc,src, w,h); //I don't know how many block and thread should I use. 
     // NOTE(review): same argument problems as the uploads above (&dst, &hostDst, &pitch).
     cudaMemcpy2D(&dst, w*sizeof(float), &hostDst, &pitch, w*sizeof(float),h, cudaMemcpyDeviceToHost);
     cudaFree(hostSrc); cudaFree(hostDst);
}

Can anyone help me, or does anyone have any suggestions? Thanks.

If ‘process’ is going to compute the same thing as your CPU function:

dst[x][y] = src[y][x] + src[x][y] + src[x][x] + src[y][y];

then this requires width == height: with a non-square 2-D array, say with dimensions 2x3, there is no valid [3][3] location (or [2][2] with 0-based indexing), yet the code would still try to read from that address, which is an error.

Here is a quick, easy, non-optimal way of accomplishing the same thing:

#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;

#define THREADS 256
#define WIDTH 4096
#define HEIGHT WIDTH

bool InitMMTimer(UINT wTimerRes);
void DestroyMMTimer(UINT wTimerRes, bool init);
void _generate_random_values_float(float *A, const int size);

/*
 * For each element (row, col) of an N x N row-major matrix, writes
 *   dst[row][col] = src[col][row] + src[row][col] + src[col][col] + src[row][row].
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per matrix element; the
 * flat thread index is the row-major element index. Threads whose index maps
 * outside the matrix do nothing. The flat index is a 32-bit int.
 */
__global__ void process(const float *src, float *dst, const int N){
	const int flat=blockIdx.x*blockDim.x+threadIdx.x;
	const int row=flat/N;
	const int col=flat%N;

	// Surplus threads in the last block fall outside the matrix.
	if(row>=N || col>=N){
		return;
	}

	const float mirrored=src[col*N+row];  // transposed position
	const float here=src[flat];           // this element
	const float colDiag=src[col*N+col];   // diagonal entry for the column index
	const float rowDiag=src[row*N+row];   // diagonal entry for the row index
	dst[flat]=mirrored+here+colDiag+rowDiag;
}

/*
 * CPU reference for the `process` kernel: for each element (row, col) of an
 * N x N row-major matrix, writes
 *   dst[row][col] = src[col][row] + src[row][col] + src[col][col] + src[row][row].
 * Does nothing when N <= 0.
 */
void calc(const float *src, float *dst, const int N){
	int flat=0;  // row-major index of (row, col); advances with the inner loop
	for(int row=0;row<N;++row){
		for(int col=0;col<N;++col,++flat){
			const float mirrored=src[col*N+row];
			const float here=src[flat];
			const float colDiag=src[col*N+col];
			const float rowDiag=src[row*N+row];
			dst[flat]=mirrored+here+colDiag+rowDiag;
		}
	}
}

/*
 * Benchmark driver.
 *
 * Fills a HEIGHT x WIDTH float matrix with random values, runs the `process`
 * kernel on the GPU (timing upload + kernel + download together), runs the
 * `calc` CPU reference, then prints both timings and the largest absolute
 * difference between the two results.
 *
 * CUDA errors are printed with file/line and execution continues, so a later
 * message may be a consequence of an earlier one.
 *
 * Returns 0 on completion, 1 if the host buffers cannot be allocated.
 */
int main(){
	char ch;
	srand((unsigned int)time(NULL));

	const int NN=HEIGHT*WIDTH,lda=max(HEIGHT,WIDTH);
	const size_t numbytes=(size_t)NN*sizeof(float);
	float *SRC=(float *)malloc(numbytes);
	float *H_RESULT=(float *)malloc(numbytes);  // CPU result
	float *D_RESULT=(float *)malloc(numbytes);  // host copy of the GPU result
	float *D_SRC=NULL,*D_DST=NULL;              // device buffers

	// Three buffers of NN floats each; bail out cleanly instead of dereferencing NULL.
	if(SRC==NULL || H_RESULT==NULL || D_RESULT==NULL){
		cout<<"\nFailed to allocate host buffers.\n";
		free(SRC);
		free(H_RESULT);
		free(D_RESULT);
		return 1;
	}

	_generate_random_values_float(SRC,NN);

	cudaError_t err=cudaMalloc((void **)&D_SRC,numbytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMalloc((void **)&D_DST,numbytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// ---- GPU timing: host->device copy, kernel, device->host copy ----
	UINT wTimerRes = 0;
	bool init = InitMMTimer(wTimerRes);
	DWORD startTime=0,GPUtime=0,CPUtime=0;
	startTime = timeGetTime();

	err=cudaMemcpy(D_SRC,SRC,numbytes,cudaMemcpyHostToDevice);//copy array values from host into device memory
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// One thread per element; the block count is rounded up so every element is covered.
	process<<<((NN+THREADS-1)/THREADS),THREADS>>>(D_SRC,D_DST,lda);
	// A launch does not return an error itself; bad launch configurations show up here.
	err=cudaGetLastError();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	// Wait for the kernel so that errors raised during execution are reported at this point.
	err=cudaDeviceSynchronize();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(D_RESULT,D_DST,numbytes,cudaMemcpyDeviceToHost);//copy back result to host memory
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	DWORD endTime = timeGetTime();
	GPUtime=endTime-startTime;
	cout<<"\nGPU total time(copy from host, process, copy back result): "<<float(GPUtime)/1000.0f<< "seconds.\n";
	DestroyMMTimer(wTimerRes, init);

	// ---- CPU timing ----
	wTimerRes = 0;
	init = InitMMTimer(wTimerRes);
	startTime = timeGetTime();

	calc(SRC,H_RESULT,lda);

	endTime = timeGetTime();
	CPUtime=endTime-startTime;
	cout<<"\nCPU timing: "<<float(CPUtime)/1000.0f<<" seconds.\n";
	DestroyMMTimer(wTimerRes, init);

	// ---- Verification: largest element-wise deviation between CPU and GPU results ----
	float max_dif=0.0f;
	for(int y=0;y<HEIGHT;y++)for(int x=0;x<WIDTH;x++){
		float h=H_RESULT[y*WIDTH+x],d=D_RESULT[y*WIDTH+x];
		max_dif=max(max_dif,fabs(h-d));
	}
	cout<<"\nMax absolute difference between CPU and GPU results: "<<max_dif<<"\n";

	err=cudaFree(D_SRC);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFree(D_DST);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	free(SRC);
	free(H_RESULT);
	free(D_RESULT);
	cin>>ch;  // keep the console window open until the user types something
	return 0;
}

bool InitMMTimer(UINT wTimerRes){
	TIMECAPS tc;
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	timeBeginPeriod(wTimerRes); 
	return true;
}

void DestroyMMTimer(UINT wTimerRes, bool init){
	if(init)
		timeEndPeriod(wTimerRes);
}

/*
 * Fills A[0 .. size) with pseudo-random non-negative floats and prints a
 * confirmation line to stdout.
 *
 * Each value is (rand() % 127) / ((rand() % 997) + 1), so it lies in [0, 126].
 * The numerator is drawn before the denominator in separate statements: as
 * operands of a single division, the order of the two rand() calls would be
 * unspecified, making the sequence for a given seed compiler-dependent.
 *
 * Writes nothing to A when size <= 0 (the message is still printed).
 */
void _generate_random_values_float(float *A, const int size){
	for(int i=0;i<size;i++){
		const float numerator=float(std::rand()%127);
		const float denominator=float((std::rand()%997)+1);  // +1 keeps the divisor non-zero
		A[i]=numerator/denominator;
	}
	std::cout<<"\nFilled vector array with random float values\n";
}

That seems to be correct, but there are much faster ways to accomplish the same result.
Even this crude version is still 6-10x faster than the CPU version.