Generate a submatrix

Hi!

I’m starting with CUDA C++.

I am working on matrix-matrix products, but I need to generate a “submatrix”, as in this MATLAB expression:

a(2:end-1,2:end-1)

I searched in cuBLAS but can not find a function or set of functions to help me solve the task.
I tried cublasScopy, but I could only copy the first column.

Has anyone worked with this type of submatrix?

If you plan to apply further CUBLAS functions to the submatrix, pretty much all CUBLAS functions that accept matrices can restrict the operation to a submatrix. That is what the parameters lda, ldb, ldc are for. They are the “leading dimension” (number of rows) of the containing matrix, the pointers to the matrices plus the dimensions m, n, k then select the desired sub-matrix. I am fairly sure there are examples of this among the sample programs that ship with CUDA.

Just ran a quick test, and based on that this seems to match the MATLAB output:

MATLAB script:

src_rows=int32(1235);
src_columns=int32(2173);

dst_rows=int32(900);
dst_columns=int32(1900);



Src=single(randn(src_rows,src_columns));

Matlab_ans=Src(236:end-100,174:end-100);

adj_zero_based_rows=int32(235);
adj_zero_based_columns=int32(173);

[Gpu_ans]=sub_mat(Src,src_rows,src_columns,dst_rows,dst_columns,adj_zero_based_rows,adj_zero_based_columns);

 max(abs(Matlab_ans(:)-Gpu_ans(:)))

.cpp:

#include <cstring>
#include <cmath>
#include "stdafx.h"
#include "stdio.h"
#include "mex.h"
#include "matrix.h"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")

#define _DTH cudaMemcpyDeviceToHost
#define _DTD cudaMemcpyDeviceToDevice
#define _HTD cudaMemcpyHostToDevice

bool InitMMTimer(UINT wTimerRes);
void DestroyMMTimer(UINT wTimerRes, bool init);


extern "C" void get_sub_wrap(
			const float *Src,
			float *Dst,
			const int src_rows,
			const int src_columns,
			const int dst_rows,
			const int dst_columns,
			const int adj_rows,
			const int adj_columns);



void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[]){
	// MEX entry point: extracts a sub-matrix of a single-precision,
	// column-major MATLAB matrix on the GPU.
	//
	// prhs[0] Src          source matrix (single)
	// prhs[1] src_rows     rows of Src
	// prhs[2] src_columns  columns of Src
	// prhs[3] dst_rows     rows of the sub-matrix
	// prhs[4] dst_columns  columns of the sub-matrix
	// prhs[5] adj_rows     0-based row offset of the sub-matrix in Src
	// prhs[6] adj_columns  0-based column offset of the sub-matrix in Src
	// plhs[0]              dst_rows x dst_columns single matrix (result)
	//
	// NOTE(review): inputs are trusted as-is; wrong sizes/offsets lead to
	// out-of-bounds device reads in the kernel.
	float *Src=(float *)mxGetPr(prhs[0]);
	const int src_rows=(int)mxGetScalar(prhs[1]);
	const int src_columns=(int)mxGetScalar(prhs[2]);
	const int dst_rows=(int)mxGetScalar(prhs[3]);
	const int dst_columns=(int)mxGetScalar(prhs[4]);
	const int adj_rows=(int)mxGetScalar(prhs[5]);
	const int adj_columns=(int)mxGetScalar(prhs[6]);

	// Byte counts in size_t so matrices with more than 2^31/4 elements do
	// not overflow the (int * int) arithmetic.
	const size_t src_bytes=(size_t)src_rows*(size_t)src_columns*sizeof(float);
	const size_t dst_bytes=(size_t)dst_rows*(size_t)dst_columns*sizeof(float);

	float *D_Src,*D_Dst;
	cudaError_t err=cudaMalloc((void**)&D_Src,src_bytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaMalloc((void**)&D_Dst,dst_bytes);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(D_Src,Src,src_bytes,cudaMemcpyHostToDevice);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// Launch the extraction kernel (the wrapper synchronizes internally).
	get_sub_wrap(D_Src,D_Dst,src_rows,src_columns,dst_rows,dst_columns,adj_rows,adj_columns);

	// Allocate the MATLAB output and copy the device result back into it.
	plhs[0]=mxCreateNumericMatrix(dst_rows,dst_columns,mxSINGLE_CLASS,mxREAL);
	float *result0=(float *)mxGetPr(plhs[0]);

	err=cudaMemcpy(result0,D_Dst,dst_bytes,cudaMemcpyDeviceToHost);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaFree(D_Src);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFree(D_Dst);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	// NOTE(review): cudaDeviceReset() tears down the whole CUDA context on
	// every call, forcing expensive re-initialization on the next MEX
	// invocation and invalidating any other device state MATLAB may hold.
	// Kept for behavioral compatibility — consider removing it.
	err=cudaDeviceReset();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
}

// Request the finest available Windows multimedia timer resolution.
// Returns true if timeBeginPeriod was issued, false if the timer device
// capabilities could not be queried.
//
// NOTE(review): wTimerRes is taken BY VALUE, so the resolution actually
// chosen here never reaches the caller. DestroyMMTimer may then call
// timeEndPeriod with a different value than the one begun here. Consider
// passing by reference (and updating the L60 declaration) — TODO confirm
// with callers.
bool InitMMTimer(UINT wTimerRes){
	TIMECAPS tc;
	// Query the supported timer resolution range; bail out on failure.
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	// Clamp to at least 1 ms, within what the device supports.
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	timeBeginPeriod(wTimerRes); 
	return true;
}

// Release the multimedia timer resolution previously requested via
// InitMMTimer, but only when initialization actually succeeded.
void DestroyMMTimer(UINT wTimerRes, bool init){
	if(!init)
		return;
	timeEndPeriod(wTimerRes);
}

.cu:

#include "stdafx.h"
#include "stdio.h"
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define THREADS_BIG 256

// Copies a dst_rows x dst_columns sub-matrix out of a column-major
// src_rows x src_columns source matrix, both in device global memory.
//
// Launch layout: grid.x covers destination rows in chunks of blockDim.x
// threads; grid.y carries one entry per destination column.
// adj_rows / adj_columns are the 0-based offsets of the sub-matrix inside
// Src. The caller must guarantee
//   adj_rows + dst_rows       <= src_rows  and
//   adj_columns + dst_columns <= src_columns,
// otherwise out-of-bounds reads occur.
__global__ void get_sub(
			const float * __restrict__ Src,
			float * __restrict__ Dst,
			const int src_rows,
			const int src_columns,
			const int dst_rows,
			const int dst_columns,
			const int adj_rows,
			const int adj_columns){
	const int row_loc=threadIdx.x+blockIdx.x*blockDim.x;
	const int col_loc=blockIdx.y;
	// Guard both dimensions: grid.x is rounded up to a multiple of the
	// block size, and the column check keeps the kernel correct under any
	// launch configuration (not just the one the wrapper uses).
	if(row_loc<dst_rows && col_loc<dst_columns){
		// Column-major addressing (MATLAB layout): element (r,c) lives at
		// c*rows + r. Adjacent threads touch adjacent rows, so both the
		// load and the store are coalesced.
		Dst[col_loc*dst_rows+row_loc]=Src[(col_loc+adj_columns)*src_rows+(row_loc+adj_rows)];
	}
}

// Host wrapper: launches get_sub over a dst_rows x dst_columns output.
// All pointers must be device pointers. Synchronizes before returning so
// that asynchronous kernel errors surface here rather than at some later,
// unrelated CUDA call.
extern "C" void get_sub_wrap(
			const float *Src,
			float *Dst,
			const int src_rows,
			const int src_columns,
			const int dst_rows,
			const int dst_columns,
			const int adj_rows,
			const int adj_columns){
	// One thread per destination row (ceil-div), one grid.y entry per
	// destination column.
	dim3 grid((dst_rows+THREADS_BIG-1)/THREADS_BIG,dst_columns,1);
	get_sub<<<grid,THREADS_BIG>>>(Src,Dst,src_rows,src_columns,dst_rows,dst_columns,adj_rows,adj_columns);
	// Kernel launches do not return errors directly: catch launch-config
	// errors (bad grid/block) immediately...
	cudaError_t err=cudaGetLastError();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	// ...then synchronize to surface asynchronous execution errors.
	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
	// the current equivalent.
	err=cudaDeviceSynchronize();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
}

MATLAB test output:

max(abs(Matlab_ans(:)-Gpu_ans(:)))

ans =

     0

>> sub_tst

ans =

     0

>> Matlab_ans(1:3,1:3)

ans =

      -0.1339525        1.312425        1.235296
      -0.5988593        1.197179       -1.668345
       -1.370762        1.107954      -0.8107702

>> Gpu_ans(1:3,1:3)

ans =

      -0.1339525        1.312425        1.235296
      -0.5988593        1.197179       -1.668345
       -1.370762        1.107954      -0.8107702

It is possible there still is a mistake in there somewhere, but given the MATLAB output it seems to be correct, at least for this simple test case.

‘ans’ is the max error between the CUDA version and the MATLAB version. It shows that to be 0, which means they match in this case.

No guarantees to this, just trying to help you out and I wrote this up quickly.

It is up to you to get the inputs correct, otherwise out-of-bounds memory accesses can occur. Do not blame me for such errors.

Also this is using COLUMN MAJOR, like MATLAB. This is not going to be too much faster than MATLAB, unless you are doing lots of large matrices. The memory copy times both directions will probably be more than the kernel time for the operation.

Another less uniform test:

MATLAB script:

src_rows=int32(1235);
src_columns=int32(2173);

dst_rows=int32(900);
dst_columns=int32(1800);

Src=single(randn(src_rows,src_columns));

Matlab_ans=Src(236:end-100,174:end-200);

adj_zero_based_rows=int32(235);
adj_zero_based_columns=int32(173);

[Gpu_ans]=sub_mat(Src,src_rows,src_columns,dst_rows,dst_columns,adj_zero_based_rows,adj_zero_based_columns);

 max(abs(Matlab_ans(:)-Gpu_ans(:)))

MATLAB output and quick sub-section verification:

sub_tst

ans =

     0

>> Matlab_ans(897:end,1797:end)

ans =

        1.258649        0.771333        1.326151      -0.5254724
       0.9676864        2.067348        1.584405       -1.467821
      0.08143418       0.3438582       -1.207133      -0.9501683
        0.239234       -1.657374        1.601898      -0.2766352

>> Gpu_ans(897:end,1797:end)

ans =

        1.258649        0.771333        1.326151      -0.5254724
       0.9676864        2.067348        1.584405       -1.467821
      0.08143418       0.3438582       -1.207133      -0.9501683
        0.239234       -1.657374        1.601898      -0.2766352

NOTE: There would be a more optimal way to write the kernel, but this simple example gives the basic idea.
If you can use cuBLAS to achieve the same thing, specifically in the way njuffa suggested, that would be the better approach.