GPU: GeForce MX110, CUDA version: 12.1
I am getting wrong output when running a vector addition in CUDA, and sometimes I get the CUDA error "no kernel image".
How can I resolve this? (PATH is set and the installation was done correctly.)
The code is given below.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 10
#define MAX_ERR 1e-6
// Element-wise vector addition: out[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index (blockIdx.x * blockDim.x + threadIdx.x) and then advances by the total
// number of launched threads, so every element is written exactly once whether
// the launch is <<<1, n>>>, a multi-block launch, or has fewer threads than n.
//
// Parameters:
//   out  - destination array, written at indices [0, n)
//   a, b - input arrays, read at indices [0, n)
//   n    - number of elements to process; n <= 0 does nothing
//
// All three pointers are dereferenced inside the kernel, so they must refer to
// memory the device can access. No shared memory is used.
__global__ void vector_add(float* out, const float* a, const float* b, int n) {
    // 64-bit index arithmetic so that blockIdx.x * blockDim.x and the repeated
    // "+= stride" cannot overflow an int on large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        out[i] = a[i] + b[i];
    }
}
// Terminates the program with a readable message when a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) did not return cudaSuccess.
//   status - value returned by the CUDA runtime
//   what   - short description of the operation, used in the message
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two N-element host vectors on the GPU, prints the inputs and the
// result, and verifies every element against the host-side sum.
// Returns EXIT_SUCCESS when all elements match within MAX_ERR, EXIT_FAILURE
// otherwise; any failing allocation or CUDA call terminates the program.
int main() {
    const size_t bytes = sizeof(float) * N;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate host memory
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Allocate device memory
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Transfer data from host to device memory
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Executing kernel: one block of N threads, one element per thread.
    vector_add <<< 1, N >>> (d_out, d_a, d_b, N);
    // A launch does not return a status; launch failures are picked up here.
    // NOTE(review): a "no kernel image" error at this point most likely means
    // the binary contains no code for this GPU's architecture. Rebuilding with
    // an nvcc -arch / -gencode option matching the device's compute capability
    // is the usual fix (presumably sm_50 for the GPU in question; confirm with
    // deviceQuery).
    check_cuda(cudaGetLastError(), "vector_add launch");
    // Errors raised while the kernel runs are reported by the next sync.
    check_cuda(cudaDeviceSynchronize(), "vector_add execution");

    // Transfer data back to host memory. This must happen BEFORE out[] is read
    // on the host; until then out[] is uninitialized malloc memory.
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost), "copy out to host");

    for (int i = 0; i < N; i++) {
        printf("a[i]= %f\n", a[i]);
        printf("b[i]= %f\n", b[i]);
        printf("out[i]= %f\n", out[i]);
    }

    // Verification. An explicit check is used instead of assert() so it still
    // runs when the program is built with NDEBUG.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        if (!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)) {
            fprintf(stderr, "mismatch at index %d: got %f, expected %f\n",
                    i, out[i], a[i] + b[i]);
            mismatches++;
        }
    }
    if (mismatches == 0) {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    }

    // Deallocate device memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");

    // Deallocate host memory
    free(a);
    free(b);
    free(out);

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}