submatrix computation

According to slide 23 of thispresentation, I can use a full or submatrix (by providing the pointer to its top-left entry) to perform a general matrix-vector multiplication. While the following worked for my implementation in Intel Math Kernel Library:

sgemv('n', Ndim, 3*numNl, 1.0, &Matrix[3*Nl[numNl-1]*Ndim], max(1, Ndim),

		  forces, 1, 0, disp, 1);

When I tried a nearly-identical function call in cublas, with all of Kinv, forces, and disp located in CUDA memory context:

cublasSgemv('n', Ndim, 3*numNl, 1.0f, &gpu_vars->Kinv[3*Nl[numNl-1]*Ndim],

			max(1, Ndim), gpu_vars->forces, 1, 0.0f, gpu_vars->disp, 1);

The program doesn’t calculate properly. I’m wondering if this scheme doesn’t work. In layman’s terms, I’m trying to use 3 columns of the matrix, each column containing Ndim rows, with the starting index of the submatrix located at 3*Nl[numNl-1]Ndim (i.e. column # 3Nl[numNl-1]). Is this type of operation illegal when done in CUDA? I know I can extract partial data from CUDA memory blocks, but not sure if the same logic can apply here. Thanks in advance for your help.

That won’t work because you cannot manipulate device pointers on the host.

I’ve actually tried something similar on a small scale source code that does exactly that, and it seemed to work:

/* Includes, system */

#include "stdafx.h"

#include "nsim_matrix.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include <cuda_runtime.h>

#include <cublas.h>

/* Matrix size */

#define N  (5)

/* Main */

int main(int argc, char** argv)

{	

	cublasStatus status;

	cudaError state;

	float* h_A;

	float* h_B;

	float* h_C;

	float* h_result;

	float* h_C_ref;

	float* d_A = 0;

	float* d_B = 0;

	float* d_C = 0;

	float* d_result = 0;

	float alpha = 1.0f;

	float beta = 0.0f;

	int n2 = N * N;

	int i;

	float error_norm;

	float ref_norm;

	float diff;

	/* Initialize CUBLAS */

	printf("simpleCUBLAS test running..\n");

	status = cublasInit();

	if (status != CUBLAS_STATUS_SUCCESS) {

		fprintf (stderr, "!!!! CUBLAS initialization error\n");

		return EXIT_FAILURE;

	}

	/* Allocate host memory for the matrices */

	h_A = (float*)malloc(n2 * sizeof(h_A[0]));

	if (h_A == 0) {

		fprintf (stderr, "!!!! host memory allocation error (A)\n");

		return EXIT_FAILURE;

	}

	h_B = (float*)malloc(n2 * sizeof(h_B[0]));

	if (h_B == 0) {

		fprintf (stderr, "!!!! host memory allocation error (B)\n");

		return EXIT_FAILURE;

	}

	h_C = (float*)malloc(N * sizeof(h_C[0]));

	if (h_C == 0) {

		fprintf (stderr, "!!!! host memory allocation error (C)\n");

		return EXIT_FAILURE;

	}

	h_result = (float*) malloc(N * sizeof(h_result[0]));

	if (h_result == 0) {

		fprintf (stderr, "!!!! host memory allocation error (result)\n");

		return EXIT_FAILURE;

	}

	/* Fill the matrices with test data */

	for (i = 0; i < n2; i++) {

		h_A[i] = rand() % 10 + 1;

		h_B[i] = rand() % 10 + 1;

	}

	for (i = 0; i < N; i++)

		h_C[i] = rand() % 10 + 1;

	/* Allocate device memory for the matrices */

	state = cudaMalloc((void**)&d_A, n2*sizeof(d_A[0]));

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device memory allocation error (A)\n");

		return EXIT_FAILURE;

	}

	state = cudaMalloc((void**)&d_B, n2*sizeof(d_B[0]));

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device memory allocation error (B)\n");

		return EXIT_FAILURE;

	}

	state = cudaMalloc((void**)&d_C, N*sizeof(d_C[0]));

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device memory allocation error (C)\n");

		return EXIT_FAILURE;

	}

	state = cudaMalloc((void**)&d_result, N*sizeof(d_result[0]));

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device memory allocation error (result)\n");

		return EXIT_FAILURE;

	}

	/* Initialize the device matrices with the host matrices */

	state = cudaMemcpy(d_A, h_A, n2*sizeof(h_A[0]), cudaMemcpyHostToDevice);

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device access error (write A)\n");

		return EXIT_FAILURE;

	}

	state = cudaMemcpy(d_B, h_B, n2*sizeof(h_B[0]), cudaMemcpyHostToDevice);

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device access error (write B)\n");

		return EXIT_FAILURE;

	}

	state = cudaMemcpy(d_C, h_C, N*sizeof(h_C[0]), cudaMemcpyHostToDevice);

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device access error (write C)\n");

		return EXIT_FAILURE;

	}

	/* Performs operation using plain C code */

	//simple_sgemm(N, alpha, h_A, h_B, beta, h_C);

	sgemv('n', N, 1, 1.0, &h_A[3*N], N, h_C, 1, 0.0, h_result, 1);

	h_C_ref = h_C;

	/* Clear last error */

	cublasGetError();

	/* Performs operation using cublas */

	cublasSgemv('n', N, 1, 1.0, &d_A[3*N], N, d_C, 1, 0.0, d_result, 1);

	//cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

	status = cublasGetError();

	if (status != CUBLAS_STATUS_SUCCESS) {

		fprintf (stderr, "!!!! kernel execution error.\n");

		return EXIT_FAILURE;

	}

	/* Allocate host memory for reading back the result from device memory */

	h_C = (float*)malloc(N * sizeof(h_C[0]));

	if (h_C == 0) {

		fprintf (stderr, "!!!! host memory allocation error (C)\n");

		return EXIT_FAILURE;

	}

	/* Read the result back */

	state = cudaMemcpy(h_C, d_result, N*sizeof(h_C[0]), cudaMemcpyDeviceToHost);

	if (state != cudaSuccess) {

		fprintf (stderr, "!!!! device access error (read C)\n");

		return EXIT_FAILURE;

	}

	/* Memory clean up */

	free(h_A);

	free(h_B);

	free(h_C);

	free(h_result);

	free(h_C_ref);

	state = cudaFree(d_A);

	if (status != cudaSuccess) {

		fprintf (stderr, "!!!! memory free error (A)\n");

		return EXIT_FAILURE;

	}

	state = cudaFree(d_B);

	if (status != cudaSuccess) {

		fprintf (stderr, "!!!! memory free error (B)\n");

		return EXIT_FAILURE;

	}

	state = cudaFree(d_C);

	if (status != cudaSuccess) {

		fprintf (stderr, "!!!! memory free error (C)\n");

		return EXIT_FAILURE;

	}

	state = cudaFree(d_result);

	if (status != cudaSuccess) {

		fprintf (stderr, "!!!! memory free error (result)\n");

		return EXIT_FAILURE;

	}

	/* Shutdown */

	status = cublasShutdown();

	if (status != CUBLAS_STATUS_SUCCESS) {

		fprintf (stderr, "!!!! shutdown error (A)\n");

		return EXIT_FAILURE;

	}

	if (argc > 1) {

		if (!strcmp(argv[1], "-noprompt") ||

			!strcmp(argv[1], "-qatest") ) 

		{

			return EXIT_SUCCESS;

		}

	} 

	else

	{

		printf("\nPress ENTER to exit...\n");

		getchar();

	}

	return EXIT_SUCCESS;

}

The problem could be something else, but at least the answer has been found experimentally.

“seemed” is the operative word. There is no magic here. You cannot manipulate device pointers on the host. No experimental evidence, no exceptions other than in emulation mode, when there are no device pointers, just host pointers.