CuDNN GRU has a bug: results differ from a native GRU implementation

Hi, I think the CuDNN GRU path has a bug. The code below computes the same GRU twice with the same flat weight buffer, once with a native NumPy implementation and once with cudnn_rnn_ops.CudnnGRU, and the two results differ. Please take a look, thanks.
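For reference, the GRU formulation the native implementation below follows (which, as far as I can tell, matches the one in the cuDNN documentation, with the reset gate applied after the recurrent matmul) is:

u_t = sigmoid(W_u x_t + R_u h_{t-1} + b_Wu + b_Ru)
r_t = sigmoid(W_r x_t + R_r h_{t-1} + b_Wr + b_Rr)
h'_t = tanh(W_c x_t + r_t * (R_c h_{t-1} + b_Rc) + b_Wc)
h_t = u_t * h_{t-1} + (1 - u_t) * h'_t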

import numpy as np
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

S_MIN=-40
S_MAX=13
EXP_MAX=40

def native_gru(input, w):

    seq_len, batch_size, hidden_size = input.shape

    # Unpack the flat buffer in the order assumed here: 3 input weight
    # matrices (update, reset, candidate), then 3 recurrent weight
    # matrices, then 3 input biases, then 3 recurrent biases.
    offset = 0
    wu = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size
    wr = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size
    wc = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size

    ru = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size
    rr = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size
    rc = w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size)).transpose()
    offset += hidden_size*hidden_size

    bx_u = w[offset:offset + hidden_size]
    offset += hidden_size
    bx_r = w[offset:offset + hidden_size]
    offset += hidden_size
    bx_c = w[offset:offset + hidden_size]
    offset += hidden_size

    bh_u = w[offset:offset + hidden_size]
    offset += hidden_size
    bh_r = w[offset:offset + hidden_size]
    offset += hidden_size
    bh_c = w[offset:offset + hidden_size]
    offset += hidden_size

    def sigmoid(x):
        # Clamp the input to avoid overflow in np.exp().
        y = np.copy(x)
        y[x < S_MIN] = S_MIN
        y[x > S_MAX] = S_MAX
        return 1. / (1. + np.exp(-y))

    def tanh(x):
        # tanh(x) = 2*sigmoid(2x) - 1, with the exponent clamped.
        y = -2. * x
        y[y > EXP_MAX] = EXP_MAX
        return (2. / (1. + np.exp(y))) - 1.

    output = []
    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)

    for i in range(seq_len):
        emb_1 = input[i]
        update_gate = sigmoid(np.matmul(emb_1, wu) + np.matmul(pre_h, ru) + bx_u + bh_u)
        reset_gate = sigmoid(np.matmul(emb_1, wr) + np.matmul(pre_h, rr) + bx_r + bh_r)
        # The reset gate is applied after the recurrent matmul, as in cuDNN's GRU.
        h_t_temp = tanh(np.matmul(emb_1, wc) + reset_gate*(np.matmul(pre_h, rc) + bh_c) + bx_c)
        new_h = update_gate*pre_h + (1 - update_gate)*h_t_temp
        pre_h = new_h
        output.append(new_h)

    output = np.concatenate(output, -1)
    output = output.reshape((batch_size, -1, hidden_size))
    output = output.transpose((1, 0, 2))
    return output

def TestGRU():
    num_steps = 2
    batch_size = 1
    hidden_size = 1
    # 3 input + 3 recurrent weight matrices, plus 6 bias vectors.
    input_w_size = 6*hidden_size*hidden_size + 6*hidden_size
    x = np.random.uniform(low=-0.1, high=0.1,
                          size=(num_steps, batch_size, hidden_size)).astype(np.float32)
    #x = np.ones((num_steps, batch_size, hidden_size)).astype(np.float32)

    flat_w = np.random.uniform(low=-0.1, high=0.1,
                               size=(input_w_size,)).astype(np.float32)
    #flat_w = np.ones((input_w_size,)).astype(np.float32)
    out = native_gru(x, flat_w)

    n_layer = 1
    rnn_cudnn = cudnn_rnn_ops.CudnnGRU(n_layer, hidden_size, hidden_size, 'linear_input')
    #param_cudnn = tf.Variable(tf.ones([rnn_cudnn.params_size()]), validate_shape=False)
    param_cudnn = tf.Variable(flat_w)

    y_cudnn, state_cudnn = rnn_cudnn(x,  #tf.transpose(x, [1, 0, 2]),
                                     tf.zeros([n_layer, batch_size, hidden_size]),
                                     param_cudnn)
    print("native gru:" + str(out))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print("cudnn gru:" + str(sess.run([y_cudnn])))

if __name__ == '__main__':
    TestGRU()
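One possible cause worth ruling out is that flat_w does not actually have the layout the opaque cuDNN parameter buffer expects. Below is a minimal sketch of that check (to be run inside TestGRU(), on a GPU machine), assuming the contrib canonical_to_params method behaves as documented; I have not verified it on every TF version. It packs the six weight matrices and six bias vectors, in the same order native_gru assumes, through TensorFlow instead of trusting the flat layout directly:

# Hypothetical layout check: if the packed buffer differs from flat_w,
# the mismatch is in the weight layout, not in the cuDNN kernel itself.
weights = []
offset = 0
for _ in range(6):
    weights.append(tf.constant(
        flat_w[offset:offset + hidden_size*hidden_size].reshape(
            (hidden_size, hidden_size))))
    offset += hidden_size*hidden_size
biases = []
for _ in range(6):
    biases.append(tf.constant(flat_w[offset:offset + hidden_size]))
    offset += hidden_size
param_packed = rnn_cudnn.canonical_to_params(weights, biases)

with tf.Session() as sess:
    print("max layout diff:",
          sess.run(tf.reduce_max(tf.abs(param_packed - flat_w))))

If the difference is nonzero, feeding param_packed (instead of tf.Variable(flat_w)) to rnn_cudnn should make the two implementations agree.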