# how to replace a row in matrix by another according to a condition in CUDA

I have two matrices A and B both of size (3*1024). i wrote a small program in CUDAC to replace a row in Matrix A by the corresponding row in Matix B according to another two vectors val1 and val2, as example:

val1: [10 20 30]
val2: [7 17 6]
Matrix A:{1000 …1000,1000 …1000,1000 …1000}
Matrix B:{500 …500,500 …500,500 …500}

and i want the result for matrix A to be:

Matrix A:{500 …500,1000 …1000,500 …500}

this is the code:

``````#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void Replacement2(float * oldval, float * newval, const int D, float *  A, const float *  B) {

int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int index = col + row * D;

if (newval[row] < oldval[row]) {
A[index] = B[index];
oldval[row] = newval[row];
}

}
int main()
{

unsigned int size_old = 3 * 1024;
unsigned int mem_size_old = sizeof(float)* size_old;

unsigned int size_new = 3 * 1024;
unsigned int mem_size_new = sizeof(float)* size_new;

unsigned int sobj = sizeof(float)* 3;

unsigned int snobj = sizeof(float)* 3;

const float h_old_data[] = {1000 .......1000,1000 .......1000,1000 .......1000};
float* h_old = (float*)memcpy(new float[3*1024], h_old_data, sizeof(h_old_data));
const float h_new_data[] = {500 .......500,500 .......500,500 .......500};
float* h_new = (float*)memcpy(new float[3*1024], h_new_data, sizeof(h_new_data));

const float h_val[] = { 10, 20, 30};
float* OLDVAL = (float*)memcpy(new float[3], h_val, sizeof(h_val));

const float h_newval[] = { 7.0, 70.0, 7.0};
float* NEWVAL = (float*)memcpy(new float[3], h_newval, sizeof(h_newval));

float* d_oldval;
cudaMalloc((void**)&d_oldval, sizeof(sobj));
cudaMemcpy(d_oldval, h_val, sizeof(sobj), cudaMemcpyHostToDevice);

float* d_newval;
cudaMalloc((void**)&d_newval, sizeof(sobj));
cudaMemcpy(d_newval, h_newval, sizeof(sobj), cudaMemcpyHostToDevice);

float* d_oldvec;
cudaMalloc((void**)&d_oldvec, mem_size_old);
cudaMemcpy(d_oldvec, h_old, mem_size_old, cudaMemcpyHostToDevice);

float* d_newvec;
cudaMalloc((void**)&d_newvec, mem_size_new);
cudaMemcpy(d_newvec, h_new, mem_size_new, cudaMemcpyHostToDevice);

Replacement2 << <dim3(1,3), dim3(1024, 1) >> >(d_oldval, d_newval, 1024, d_oldvec, d_newvec);

cudaMemcpy(h_old, d_oldvec, mem_size_old, cudaMemcpyDeviceToHost);

for (int x1 = 0; x1 < 3; x1++){
for (int y1 = 0; y1 < 1024; y1++)
fprintf(fpout, "%f  ", h_old[x1 * 1024 + y1]);
fprintf(fpout, "\n");
}

return 0;
}
``````

but the replacement happens only in the first row. what is the problem??

I spotted one problem (though it’s likely not the root cause of why the program is not working)

Never call __syncthreads() when not all threads of a block would be taking part in executing the statement.

The __syncthreads() must be outside the if() condition as it depends on the variable row which itself is a function of threadIdx.y.

+1 :)