I am experimenting with a few things and have the toy code below, which simply populates a managed array with the value of “i”; in the kernel, each thread then adds its index value “tid” to its element. “i” and “tid” hold the same value, but one is the index on the host and the other is the index on the device.
#include <iostream>
using namespace std;
// Adds each element's own index (converted to float) to that element, using an
// inline PTX add.f32 instruction.
//   data: array indexed for every position below LEN; updated in place
//   LEN:  number of elements to process
// Each thread starts at blockDim.x * blockIdx.x + threadIdx.x and advances by
// gridDim.x * blockDim.x until its index reaches LEN.
__global__ void asm_kernel(float *data, const int LEN)
{
    for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < LEN; idx += gridDim.x * blockDim.x)
    {
        // add.f32 needs 32-bit float operands, so convert the index first.
        const float idxAsFloat = static_cast <float> (idx);
        asm(
            "add.f32 %0, %1, %2;"
            : "=f"(data[idx])
            : "f"(data[idx]), "f"(idxAsFloat)
        );
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Fills a managed array with 0..LEN-1, launches asm_kernel on it, and prints
// the resulting values one per line.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel fails.
int main(void)
{
    const int LEN = 2000;
    float *data = 0;

    if (!cudaOk(cudaMallocManaged(&data, LEN * sizeof(float)), "cudaMallocManaged"))
        return 1;

    // Every element is assigned here, so a preceding cudaMemset is unnecessary.
    for(int i = 0; i < LEN; i++)
        data[i] = i;

    asm_kernel <<<4, 256>>> (data, LEN);
    // A kernel launch returns no status itself: query the launch error first,
    // then synchronize to surface any fault raised while the kernel ran.
    bool ok = cudaOk(cudaGetLastError(), "asm_kernel launch")
           && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Only read the results back on the host if the kernel completed cleanly.
    if (ok)
    {
        for(int i = 0; i < LEN; i++)
            cout << data[i] << endl;
    }

    // Always release the allocation, but keep any earlier failure in the result.
    ok = cudaOk(cudaFree(data), "cudaFree") && ok;
    return ok ? 0 : 1;
}
If, in the asm input-operand line (`: "f"(data[tid]), "f"(tidf)`), I replace “f”(tidf) with “r”(tid) — that is, pass the integer “tid” instead of the float “tidf” — the value is not added, as the output printed to the screen shows.
Is it illegal for an instruction to take an operand whose type differs from the type of the operation, even if the type is explicitly specified?