I have two matrices A and B both of size (3*1024). i wrote a small program in CUDAC to replace a row in Matrix A by the corresponding row in Matix B according to another two vectors val1 and val2, as example:

val1: [10 20 30]

val2: [7 17 6]

Matrix A:{1000 …1000,1000 …1000,1000 …1000}

Matrix B:{500 …500,500 …500,500 …500}

and i want the result for matrix A to be:

Matrix A:{500 …500,1000 …1000,500 …500}

this is the code:

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
__global__ void Replacement2(float * oldval, float * newval, const int D, float * A, const float * B) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int index = col + row * D;
if (newval[row] < oldval[row]) {
A[index] = B[index];
oldval[row] = newval[row];
__syncthreads();
}
}
int main()
{
unsigned int size_old = 3 * 1024;
unsigned int mem_size_old = sizeof(float)* size_old;
unsigned int size_new = 3 * 1024;
unsigned int mem_size_new = sizeof(float)* size_new;
unsigned int sobj = sizeof(float)* 3;
unsigned int snobj = sizeof(float)* 3;
const float h_old_data[] = {1000 .......1000,1000 .......1000,1000 .......1000};
float* h_old = (float*)memcpy(new float[3*1024], h_old_data, sizeof(h_old_data));
const float h_new_data[] = {500 .......500,500 .......500,500 .......500};
float* h_new = (float*)memcpy(new float[3*1024], h_new_data, sizeof(h_new_data));
const float h_val[] = { 10, 20, 30};
float* OLDVAL = (float*)memcpy(new float[3], h_val, sizeof(h_val));
const float h_newval[] = { 7.0, 70.0, 7.0};
float* NEWVAL = (float*)memcpy(new float[3], h_newval, sizeof(h_newval));
float* d_oldval;
cudaMalloc((void**)&d_oldval, sizeof(sobj));
cudaMemcpy(d_oldval, h_val, sizeof(sobj), cudaMemcpyHostToDevice);
float* d_newval;
cudaMalloc((void**)&d_newval, sizeof(sobj));
cudaMemcpy(d_newval, h_newval, sizeof(sobj), cudaMemcpyHostToDevice);
float* d_oldvec;
cudaMalloc((void**)&d_oldvec, mem_size_old);
cudaMemcpy(d_oldvec, h_old, mem_size_old, cudaMemcpyHostToDevice);
float* d_newvec;
cudaMalloc((void**)&d_newvec, mem_size_new);
cudaMemcpy(d_newvec, h_new, mem_size_new, cudaMemcpyHostToDevice);
Replacement2 << <dim3(1,3), dim3(1024, 1) >> >(d_oldval, d_newval, 1024, d_oldvec, d_newvec);
cudaMemcpy(h_old, d_oldvec, mem_size_old, cudaMemcpyDeviceToHost);
for (int x1 = 0; x1 < 3; x1++){
for (int y1 = 0; y1 < 1024; y1++)
fprintf(fpout, "%f ", h_old[x1 * 1024 + y1]);
fprintf(fpout, "\n");
}
return 0;
}
```

but the replacement happens only in the first row. what is the problem??