Dear all,
I have a program like the following:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
// Pull the names of namespace std into the global scope.
using namespace std;
// x-dimension of the thread block passed to the kernel launch in main.
#define BLOCK_SIZE 32
// x-dimension of the grid passed to the kernel launch in main.
// main fills and prints BLOCK_SIZE*GRID_SIZE array elements.
#define GRID_SIZE 14
// Kernel definition
/**
 * Load micro-benchmark: every thread repeatedly issues `ld.global.f32` on its
 * own element of A and finally stores the last loaded value into C.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. There is no
 * element-count parameter, so the launch must cover exactly the number of
 * elements allocated for A and C (gridDim.x * blockDim.x).
 *
 * @param A  input array; must point to global memory because the loads use
 *           the `.global` state space explicitly.
 * @param C  output array; element n receives the value last loaded from A[n].
 *
 * Note: the "l" constraint passes the address in a 64-bit register, i.e. this
 * assumes a 64-bit device address space.
 */
__global__ void VecAdd(float* A, float* C) {
    // Flat element index. blockDim.x replaces the hard-coded 32 so the index
    // stays correct if the block size is ever changed.
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    const float* src = &A[n];
    const int kIterations = 1000000;

    float warmup;       // result of the single load issued before the loop
    float last = 0.0f;  // result of the most recent load inside the loop

    // Output operands replace the hand-declared PTX scratch registers, and
    // `volatile` keeps the compiler from deleting or moving the statement
    // while it generates PTX.
    asm volatile("ld.global.f32\t%0, [%1];"
                 : "=f"(warmup)
                 : "l"(src)
                 : "memory");
    (void)warmup;  // only the load itself matters, not its value

    // Keep the loop rolled: code size and compile time must not depend on the
    // trip count.
    #pragma unroll 1
    for (int m = 0; m < kIterations; m++) {
        asm volatile("ld.global.f32\t%0, [%1];"
                     : "=f"(last)
                     : "l"(src)
                     : "memory");
    }

    // Publish a result so the host reads defined data from C.
    C[n] = last;
}
/**
 * Abort the program with a diagnostic when a CUDA runtime call fails.
 *
 * @param status  value returned by the CUDA runtime call
 * @param what    short description of the call, used in the error message
 */
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: fills A with 4.0, runs VecAdd on GRID_SIZE blocks of
 * BLOCK_SIZE threads, copies C back and prints every element with its index.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any host
 *         allocation or CUDA failure.
 */
int main() {
    const int count = BLOCK_SIZE * GRID_SIZE;
    const size_t size = count * sizeof(float);

    float* A = (float*)malloc(size);
    float* C = (float*)malloc(size);
    if (A == NULL || C == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(A);
        free(C);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < count; ++i) {
        A[i] = 4.0f;  // float literal: avoids a double -> float conversion
    }

    float* gA = NULL;
    float* gC = NULL;
    checkCuda(cudaMalloc((void**)&gA, size), "cudaMalloc(gA)");
    checkCuda(cudaMalloc((void**)&gC, size), "cudaMalloc(gC)");
    checkCuda(cudaMemcpy(gA, A, size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> gA)");
    // Zero the output buffer so the values printed below are defined even for
    // elements the kernel does not write.
    checkCuda(cudaMemset(gC, 0, size), "cudaMemset(gC)");

    dim3 grid(GRID_SIZE, 1, 1);
    dim3 block(BLOCK_SIZE, 1, 1);

    // Kernel invocation
    VecAdd<<<grid, block>>>(gA, gC);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), execution errors by the next synchronizing call.
    checkCuda(cudaGetLastError(), "VecAdd launch");
    checkCuda(cudaDeviceSynchronize(), "VecAdd execution");

    checkCuda(cudaMemcpy(C, gC, size, cudaMemcpyDeviceToHost), "cudaMemcpy(gC -> C)");
    for (int j = 0; j < count; ++j) {
        printf("%f\t%d\n", C[j], j);
    }

    checkCuda(cudaFree(gA), "cudaFree(gA)");
    checkCuda(cudaFree(gC), "cudaFree(gC)");
    free(A);
    free(C);
    return 0;
}
In the kernel function there is a for loop. I find that when the loop count is below a certain number (maybe around 300000000, I don’t remember exactly), the running time of this program is very short (maybe under 5 seconds). When the loop count goes up, the running time becomes very long (a few minutes; I didn’t wait for it to finish). In this program, the instruction inside the loop is ‘ld’. Sometimes, when I try other instructions (for example, ‘cvta’), the program can’t even be compiled successfully (there are no errors; the compiler just keeps running without ever finishing). What’s wrong with the code? Does anyone know what is going on?