I want to do something about matrix calculation in cuda. I use malloc in the kernel and make sure it is right and successfully malloc memory for the thread.The code runs pretty good when there is 10000 threads but when I add to 20000 it needs very long time to finished. Any one can help me, thanks very much (I first multipled transposed X and X, and concat an identity matrix to form the original matrix)

following is my kernel function:

**global** void train_model(double *x,int *d_feature_number,int *d_train_number,int * model_number) {

int index=threadIdx.x + blockIdx.x * blockDim.x;

if (index<*model_number) {

//printf("%d",index);

int feature_number = *d_feature_number;//*d_feature_number;

int train_example_number = *d_train_number;//*d_train_number;

```
double **original = (double **) malloc(feature_number * sizeof(double *));
double *data = (double *) malloc(feature_number * 2*feature_number * sizeof(double));
for (int r = 0; r < feature_number; r++) {
original[r] = data + r * 2*feature_number;
}
double sum=0;
// double original[feature_number][2*feature_number];
for (int i = 0; i < feature_number; i++) {
for (int j = i; j < 2 * feature_number; j++) {
if (j < feature_number) {
sum = 0;
for (int k = 0; k < train_example_number; k++) {
sum += x[i + feature_number * k] * x[j + feature_number * k];
}
original[i][j] = sum;
original[j][i] = sum;
} else {
if (i == j % feature_number)
original[i][j] = 1;
else
original[i][j] = 0;
}
}
}
```

}