Throughput test (add, mul, mod) giving strange result

Hi,

I created a little program to test the throughput for different operations, like add, mul and mulmod in ptx

The problem is that I am getting strange results.

Running with the following settings

dim3 grid(800, 1, 1);

dim3 block(1024, 1, 1);

I get:

add = 5,9s

mul = 2,9s

mulmod = 2,9s

While I am supposed instead to get add<mod<mulmod… why?

/**
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */
#include <stdio.h>
#include <stdlib.h>

/**
 * This macro checks return value of the CUDA runtime call and exits
 * the application if the call failed.
 */
#define CUDA_CHECK_RETURN(value) {											\
	cudaError_t _m_cudaStat = value;										\
	if (_m_cudaStat != cudaSuccess) {										\
		fprintf(stderr, "Error %s at line %d in file %s\n",					\
				cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__);		\
		exit(1);															\
	} }

__global__ void add(void *d, int iteration, bool flag) {

	unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int result;

//	printf("Index %d\n", index);

	for (int i = 0; i < iteration; i++) {

		asm ("{\n\t"
				"add.cc.u32 %0, %1, %1;\n\t"
				"addc.u32  	%0, 0, 0;\n\t"
				"}"
				: "=r"(result) : "r"(index));
	}
	if (flag) {

		unsigned int *data = (unsigned int*) d;
		data[index] = result;
	}
}

__global__ void mod(void *d, int iteration, bool flag) {

	unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int result;

	for (int i = 0; i < iteration; i++) {

		asm ("{\n\t"
				".reg .u64 	         t1;\n\t"
				"cvt.u64.u32     t1, %1;\n\t"
				"rem.u64     t1, t1, t1;\n\t"
				"}"
				: "=r"(result) : "r"(index));
	}
	if (flag) {

		unsigned int *data = (unsigned int*) d;
		data[index] = result;
	}
}

__global__ void mulmod(void *d, int iteration, bool flag) {

	unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int result;

	for (int i = 0; i < iteration; i++) {

		asm ("{\n\t"
		         ".reg .u64 t1;\n\t"
		         ".reg .u64 t2;\n\t"
		         "mul.wide.u32  t1, %0, %0;\n\t"
		         "cvt.u64.u32   t2, %0;\n\t"
		         "rem.u64       t1, t1, t2;\n\t"
		         "}"
		         : "=r"(result) : "r"(index));
	}
	if (flag) {

		unsigned int *data = (unsigned int*) d;
		data[index] = result;
	}
}

/**
 * Host function that prepares data array and passes it to the CUDA kernel.
 */
int main(void) {
	void *d = NULL;
	float milliseconds;

	/**
	 * Number of iterations for each thread
	 */
	int iteration = 1000000;
	/**
	 * Grid configuration, number of blocks on (x, y, z)
	 */
	dim3 grid(800, 1, 1);
	/**
	 * Block configuration, number of threads on (x, y, z)
	 */
	dim3 block(1024, 1, 1);

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	CUDA_CHECK_RETURN(
			cudaMalloc((void** ) &d, sizeof(int) * grid.x * block.x));

	cudaEventRecord(start);

//	add<<<grid, block>>>(d, iteration, false);
//	mod<<<grid, block>>>(d, iteration, false);
	mulmod<<<grid, block>>>(d, iteration, false);

	// Wait for the GPU launched work to complete
	CUDA_CHECK_RETURN(cudaThreadSynchronize());

	cudaEventRecord(stop);

	cudaEventSynchronize(stop);
	cudaEventSynchronize(start);

	CUDA_CHECK_RETURN(cudaGetLastError());

	CUDA_CHECK_RETURN(cudaFree((void* ) d));

	cudaEventElapsedTime(&milliseconds, start, stop);

	printf("Time for the kernel: %f ms\n", milliseconds);

	return 0;
}

In your mul and mulmod code, the result of the operation is not used. Dead code elimination turns it into empty loops. You should (at least) write the output to %0, the register associated with the “result” variable.

Like this?

__global__ void mod(void *d, int iteration, bool flag) {

	unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int result;

	for (int i = 0; i < iteration; i++) {

		asm ("{\n\t"
				".reg .u64 	         t1;\n\t"
				"cvt.u64.u32     t1, %1;\n\t"
				".reg .u64 	         t2;\n\t"
				"cvt.u64.u32     t2, %2;\n\t"
				"rem.u64     t1, t1, t2;\n\t"
				"mov.b32       %0, t1+0;\n\t"
				"}"
				: "=r"(result) : "r"(index), "r"(threadIdx.x));
	}
	if (flag) {

		unsigned int *data = (unsigned int*) d;
		data[index] = result;
	}
}

__global__ void mulmod(void *d, int iteration, bool flag) {

	unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
	unsigned int result;

	for (int i = 0; i < iteration; i++) {

		asm ("{\n\t"
		         ".reg .u64 t1;\n\t"
		         ".reg .u64 t2;\n\t"
		         "mul.wide.u32  t1, %0, %0;\n\t"
		         "cvt.u64.u32   t2, %0;\n\t"
		         "rem.u64       t1, t1, t2;\n\t"
				 "mov.b32       %0, t1+0;\n\t"
		         "}"
		         : "=r"(result) : "r"(index));
	}
	if (flag) {

		unsigned int *data = (unsigned int*) d;
		data[index] = result;
	}
}

I still get 400ms… that should be wrong