Hi everyone, I just got 2 problems waiting to be solved. Firstly, I just wrote the following code:
#include <stdio.h>;
#include <stdlib.h>;
#include <iostream>;
#include <cuda.h>;
using namespace std;
#define BLOCK_SIZE 32
#define GRID_SIZE 14
// Kernel definition
__global__ void VecAdd(float* A, float* B, float* C) {
int i = threadIdx.x;
int j = blockIdx.x;
int n = 32*j+i;
/*for (int j = 0 ; j < 100000000 ; j++ ){
C[i] = A[i] + B[i];
}*/
asm(".reg .f32 t1;" // temp reg t1 t2 t3
".reg .f32 t2;"
".reg .f32 t3;"
"ld.global.f32 t1, [%0]; " // t1 =
: : "r" (&A[n]));
asm("ld.global.f32 t2, [%0]; " // t2 =
: : "r" (&B[n]));
for (int m = 0 ; m < 10000000 ; m++) {
asm("add.f32 t3, t1, t2;"
: :);
}
asm("mov.f32 %0, t3;" : "=f"(C[n]));
}
int main() {
size_t size = BLOCK_SIZE*GRID_SIZE*sizeof(float);
float* A = (float*)malloc(size);
float* B = (float*)malloc(size);
float* C = (float*)malloc(size);
for (int i = 0 ; i < BLOCK_SIZE*GRID_SIZE ; ++i) {
A[i]=1.0;
B[i]=3.0;
}
float* gA;
cudaMalloc((void**)&gA,size);
cudaMemcpy(gA,A,size,cudaMemcpyHostToDevice);
float* gB;
cudaMalloc((void**)&gB,size);
cudaMemcpy(gB,B,size,cudaMemcpyHostToDevice);
float* gC;
cudaMalloc((void**)&gC,size);
dim3 grid(GRID_SIZE , 1 , 1);
dim3 block(BLOCK_SIZE , 1 , 1);
// Kernel invocation
VecAdd<<<grid , block>>>(gA, gB, gC);
cudaMemcpy(C,gC,size,cudaMemcpyDeviceToHost);
for (int j=0;j<BLOCK_SIZE*GRID_SIZE;++j) {
printf("%f %d",C[j],j);
}
cudaFree(gA);
cudaFree(gB);
cudaFree(gC);
free(A);
free(B);
free(C);
return 0;
}
1, I found that there is not a positive correlation between the running time of the program and the size of the ‘for’ loop as the following code:
for (int m = 0 ; m < 10000000 ; m++) {
asm("add.f32 t3, t1, t2;"
: :);
}
But if I use the following ‘for’ loop instead of the loop above(using inline PTX assembly):
for (int j = 0 ; j < 100000000 ; j++ ){
C[i] = A[i] + B[i];
}
There is a a positive correlation between the running time and the size of the loop. So the program with inline PTX is wrong?? How could I make it right??
2, I can’t use some function like sleep in kernal. So how could I do if I want to make the program stop for a while before running the ‘for’ loop in the kernal??
Please help me or give me some hints!! Thank you!!