I am new to CUDA C/C++.

In a nutshell, I am trying to parallelize the computation of the Jaro-Winkler string distance of all pairs of strings between two vectors in CUDA.

I have implemented the Jaro-Winkler distance in C++, and it works perfectly well when I run it on the host.

```
%%cu
#include <iostream>
float jaro_winkler(const char *str1, const int len1, const char *str2, const int len2) {
// Description: This functionn computes the Jaro-Winkler distance between two strings
//
// Inputs:
// - str1: first string
// - len1: first string's length
// - str2: second string
// - len2: second string's length
//
// Output:
// - dist: Jaro-Winkler distance between str1 and str2
// If both strings are identical, tthe Jaro-Winkler distance between them is 1
if (str1 == str2) {
return 1.0;
} else {
// If either string is null, the Jaro-Winkler distance between str1 and str2 is 0
if (len1 == 0 || len2 == 0) {
return 0.0;
} else {
// We compute the number of matching characters between str1 and str2
// We consider the characters floor(max(len1, len2) / 2) - 1 away from each other
int max_dist = floor(max(len1, len2) / 2) - 1;
int match = 0;
int hash_s1[50] = { 0 }, hash_s2[50] = { 0 };
for (int i = 0; i < len1; i++) {
for (int j = max(0, i - max_dist); j < min(len2, i + max_dist) + 1; j++) {
if (str1[i] == str2[j] && hash_s2[j] == 0) {
// Two characters are matching if they appear in both strings at most max_dist characters away from each other
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break;
}
}
}
// If there is no matching characters between both strings, the Jaro-Winkler distance between them is 0
if (match == 0) {
return 0.0;
} else {
// If a positive number of matching characters is found, we need to compute the number of transpositions,
// that is, the number of matching characters that are not in the right order divided by two
float t = 0;
int point = 0;
for (int i = 0; i < len1; i++) {
if (hash_s1[i] == 1) {
while (hash_s2[point] == 0) {
point++;
}
if (str1[i] != str2[point]) {
t++;
}
point++;
}
}
t /= 2;
// The Jaro distance between str1 and str2 is defined as follows:
float dist;
dist = (((float)match / (float)len1) + ((float)match / (float)len2) + (((float)match - t) / (float)match)) / 3.0;
// To go from the Jaro distance to the Jaro-Winkler distance, we need compute the length of
// the common prefix between both strings
int prefix = 0;
for (int i = 0; i < min(len1, len2); i++) {
if (str1[i] == str2[i]) {
prefix++;
} else {
break;
}
}
prefix = min(4, prefix);
// To obtain the Jaro-Winkler distance between str1 and str2, we need to adjust the Jaro distance for
// the length of the common prefix between both strings
dist += 0.1 * prefix * (1 - dist);
return dist;
}
}
}
}
int main() {
float dist = jaro_winkler("TRACE", 5, "TROUBLE", 7);
std::cout << dist;
return 0;
}
```

However, once I move it to the device (taking care of modifying mathematical functions), it does not work. No matter what I do, the function does not seem to do anything.

```
%%cu
#include <iostream>
__device__ float jaro_winkler(const char *str1, const int len1, const char *str2, const int len2) {
// Description: This functionn computes the Jaro-Winkler distance between two strings
//
// Inputs:
// - str1: first string
// - len1: first string's length
// - str2: second string
// - len2: second string's length
//
// Output:
// - dist: Jaro-Winkler distance between str1 and str2
// If both strings are identical, tthe Jaro-Winkler distance between them is 1
if (str1 == str2) {
return 1.0;
} else {
// If either string is null, the Jaro-Winkler distance between str1 and str2 is 0
if (len1 == 0 || len2 == 0) {
return 0.0;
} else {
// We compute the number of matching characters between str1 and str2
// We consider the characters floor(max(len1, len2) / 2) - 1 away from each other
int max_dist = floorf(max(len1, len2) / 2) - 1;
int match = 0;
int hash_s1[50] = { 0 }, hash_s2[50] = { 0 };
for (int i = 0; i < len1; i++) {
for (int j = max(0, i - max_dist); j < min(len2, i + max_dist) + 1; j++) {
if (str1[i] == str2[j] && hash_s2[j] == 0) {
// Two characters are matching if they appear in both strings at most max_dist characters away from each other
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break;
}
}
}
// If there is no matching characters between both strings, the Jaro-Winkler distance between them is 0
if (match == 0) {
return 0.0;
} else {
// If a positive number of matching characters is found, we need to compute the number of transpositions,
// that is, the number of matching characters that are not in the right order divided by two
float t = 0;
int point = 0;
for (int i = 0; i < len1; i++) {
if (hash_s1[i] == 1) {
while (hash_s2[point] == 0) {
point++;
}
if (str1[i] != str2[point]) {
t++;
}
point++;
}
}
t /= 2;
// The Jaro distance between str1 and str2 is defined as follows:
float dist;
dist = (((float)match / (float)len1) + ((float)match / (float)len2) + (((float)match - t) / (float)match)) / 3.0;
// To go from the Jaro distance to the Jaro-Winkler distance, we need compute the length of
// the common prefix between both strings
int prefix = 0;
for (int i = 0; i < min(len1, len2); i++) {
if (str1[i] == str2[i]) {
prefix++;
} else {
break;
}
}
prefix = min(4, prefix);
// To obtain the Jaro-Winkler distance between str1 and str2, we need to adjust the Jaro distance for
// the length of the common prefix between both strings
dist += 0.1 * prefix * (1 - dist);
return dist;
}
}
}
}
__global__ void jaro_winkler_kernel(const char *str1[], const int len1[], int n1, const char *str2[], const int len2[], int n2, float *output) {
const int x = threadIdx.x + blockDim.x * blockIdx.x;
const int y = threadIdx.y + blockDim.y * blockIdx.y;
// We compute the Jaro-Winkler distance between str1[x] and str2[y] and return it to a flattened vector
output[x * n2 + y] = jaro_winkler(str1[x], len1[x], str2[y], len2[y]);
}
int main(void) {
float output[4];
float *d_output;
cudaMalloc((void**)&d_output, 4 * sizeof(float));
const char *string[] = {"TRACE", "TROUBLE"};
const int len[] = {5, 7};
dim3 dimGrid(2, 2);
jaro_winkler_kernel<<<1, dimGrid>>>(string, len, 2, string, len, 2, d_output);
cudaMemcpy(&output, d_output, 4 * sizeof(float), cudaMemcpyDeviceToHost);
std::cout << output[1];
}
```

I would be grateful for any advice or guidance on this problem.