This page provides an easy-to-run demo, with low computational requirements, of the ideas proposed in the paper Evidential Deep Learning to Quantify Classification Uncertainty. Using the MNIST dataset, I demonstrate how to create neural networks that are able to quantify classification uncertainty. The paper can be accessed at http://arxiv.org/abs/1806.01768
You can run this notebook in Colab using the Colab badge below:
The notebook can also be downloaded from https://muratsensoy.github.io/uncertainty.ipynb
The following lines of code demonstrate how softmax-based deep neural networks fail when they encounter out-of-sample queries.
# use this while running this notebook in Colab
%tensorflow_version 1.x
#import necessary libraries
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import scipy.ndimage as nd
%matplotlib inline
import pylab as pl
from IPython import display
from tensorflow.examples.tutorials.mnist import input_data
# Download MNIST dataset
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
K= 10 # number of classes
digit_one = mnist.train.images[4].copy()
plt.imshow(digit_one.reshape(28,28))
plt.show()
# define some utility functions
def var(name, shape, init=None):
    if init is None:
        init = tf.truncated_normal_initializer(stddev=(2/shape[0])**0.5)
    return tf.get_variable(name=name, shape=shape, dtype=tf.float32,
                           initializer=init)

def conv(Xin, f, strides=[1, 1, 1, 1], padding='SAME'):
    return tf.nn.conv2d(Xin, f, strides, padding)

def max_pool(Xin, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME'):
    return tf.nn.max_pool(Xin, ksize, strides, padding)

def rotate_img(x, deg):
    return nd.rotate(x.reshape(28,28), deg, reshape=False).ravel()
# Create a LeNet network with softmax cross entropy loss function
def LeNet_softmax(lmb=0.005):
    g = tf.Graph()
    with g.as_default():
        X = tf.placeholder(shape=[None,28*28], dtype=tf.float32)
        Y = tf.placeholder(shape=[None,10], dtype=tf.float32)
        keep_prob = tf.placeholder(dtype=tf.float32)
        # first hidden layer - conv
        W1 = var('W1', [5,5,1,20])
        b1 = var('b1', [20])
        out1 = max_pool(tf.nn.relu(conv(tf.reshape(X, [-1, 28, 28, 1]),
                                        W1, strides=[1, 1, 1, 1]) + b1))
        # second hidden layer - conv
        W2 = var('W2', [5,5,20,50])
        b2 = var('b2', [50])
        out2 = max_pool(tf.nn.relu(conv(out1, W2, strides=[1, 1, 1, 1]) + b2))
        # flatten the output
        Xflat = tf.contrib.layers.flatten(out2)
        # third hidden layer - fully connected
        W3 = var('W3', [Xflat.get_shape()[1].value, 500])
        b3 = var('b3', [500])
        out3 = tf.nn.relu(tf.matmul(Xflat, W3) + b3)
        out3 = tf.nn.dropout(out3, keep_prob=keep_prob)
        # output layer
        W4 = var('W4', [500,10])
        b4 = var('b4', [10])
        logits = tf.matmul(out3, W4) + b4
        prob = tf.nn.softmax(logits=logits)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
        l2_loss = (tf.nn.l2_loss(W3) + tf.nn.l2_loss(W4)) * lmb
        step = tf.train.AdamOptimizer().minimize(loss + l2_loss)
        # calculate accuracy
        pred = tf.argmax(logits, 1)
        truth = tf.argmax(Y, 1)
        acc = tf.reduce_mean(tf.cast(tf.equal(pred, truth), tf.float32))
    return g, step, X, Y, keep_prob, prob, acc, loss
# get the LeNet network
g1, step1, X1, Y1, keep_prob1, prob1, acc1, loss1 = LeNet_softmax()
sess1 = tf.Session(graph=g1)
with g1.as_default():
    sess1.run(tf.global_variables_initializer())

bsize = 1000  # batch size
n_batches = mnist.train.num_examples // bsize
for epoch in range(50):
    for i in range(n_batches):
        data, label = mnist.train.next_batch(bsize)
        feed_dict = {X1:data, Y1:label, keep_prob1:.5}
        sess1.run(step1, feed_dict)
        print('epoch %d - %d%% ' % (epoch+1, (100*(i+1))//n_batches), end='\r' if i < n_batches-1 else '')
    train_acc = sess1.run(acc1, feed_dict={X1:mnist.train.images, Y1:mnist.train.labels, keep_prob1:1.})
    test_acc = sess1.run(acc1, feed_dict={X1:mnist.test.images, Y1:mnist.test.labels, keep_prob1:1.})
    print('training accuracy: %2.4f \t testing accuracy: %2.4f' % (train_acc, test_acc))
The test accuracy after 50 epochs is around 98.9%. Now we want to classify a rotating digit from the MNIST dataset to see how this network behaves on samples that do not come from the training distribution. The following lines of code help us see this.
# This method rotates an image counter-clockwise and classifies it at different degrees of rotation.
# It plots the highest classification probabilities along with the class labels for each rotation degree.
def rotating_image_classification(img, sess, prob, X, keep_prob, uncertainty=None, threshold=0.5):
    Mdeg = 180
    Ndeg = int(Mdeg/10)+1
    ldeg = []
    lp = []
    lu = []
    scores = np.zeros((1,K))
    rimgs = np.zeros((28,28*Ndeg))
    for i,deg in enumerate(np.linspace(0, Mdeg, Ndeg)):
        nimg = rotate_img(img, deg).reshape(28,28)
        nimg = np.clip(a=nimg, a_min=0, a_max=1)
        rimgs[:,i*28:(i+1)*28] = nimg
        feed_dict = {X:nimg.reshape(1,-1), keep_prob:1.0}
        if uncertainty is None:
            p_pred_t = sess.run(prob, feed_dict=feed_dict)
        else:
            p_pred_t, u = sess.run([prob,uncertainty], feed_dict=feed_dict)
            lu.append(u.mean())
        scores += p_pred_t >= threshold
        ldeg.append(deg)
        lp.append(p_pred_t[0])
    labels = np.arange(10)[scores[0].astype(bool)]
    lp = np.array(lp)[:,labels]
    c = ['black','blue','red','brown','purple','cyan']
    marker = ['s','^','o']*2
    labels = labels.tolist()
    for i in range(len(labels)):
        plt.plot(ldeg, lp[:,i], marker=marker[i], c=c[i])
    if uncertainty is not None:
        labels += ['uncertainty']
        plt.plot(ldeg, lu, marker='<', c='red')
    plt.legend(labels)
    plt.xlim([0,Mdeg])
    plt.xlabel('Rotation Degree')
    plt.ylabel('Classification Probability')
    plt.show()
    plt.figure(figsize=[6.2,100])
    plt.imshow(1-rimgs, cmap='gray')
    plt.axis('off')
    plt.show()
rotating_image_classification(digit_one, sess1, prob1, X1, keep_prob1)
As shown above, a neural network trained to generate softmax probabilities fails significantly when it encounters a sample that differs from the training examples. Softmax forces the neural network to pick one class, even when the object belongs to an unknown category; this is demonstrated as the digit one is rotated between roughly 60 and 130 degrees.
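The reason is structural: softmax normalizes the logits into a distribution that must sum to one, so even arbitrary logits produce a confident-looking winner. A minimal NumPy sketch (standalone, for illustration only) makes this visible:
# Softmax always allocates all probability mass, even for arbitrary logits.
def np_softmax(z):
    e = np.exp(z - z.max())  # shift for numerical stability
    return e / e.sum()

rng = np.random.RandomState(0)
arbitrary_logits = 3 * rng.randn(10)  # stand-in for an out-of-distribution input
p = np_softmax(arbitrary_logits)
print('max probability %.2f for class %d' % (p.max(), p.argmax()))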
In the following sections, we train the same neural network using the loss functions introduced in the paper.
As described in the paper, a neural network can be trained to learn the parameters of a Dirichlet distribution instead of softmax probabilities. Dirichlet distributions with parameters $\alpha \geq 1$ behave like a generative model for softmax probabilities (categorical distributions): they associate a likelihood value with each categorical distribution.
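Concretely, for $K$ classes the network outputs a nonnegative evidence value $e_k$ for each class; the Dirichlet parameters are $\alpha_k = e_k + 1$, the Dirichlet strength is $S = \sum_k \alpha_k$, the expected class probabilities are $\hat{p}_k = \alpha_k / S$, and the uncertainty mass is $u = K / S$. A standalone NumPy sketch of this mapping:
# Map evidence to Dirichlet parameters, expected probabilities, and uncertainty.
def dirichlet_outputs(evidence):
    alpha = evidence + 1.0             # Dirichlet parameters (alpha >= 1)
    S = alpha.sum()                    # Dirichlet strength
    return alpha / S, len(alpha) / S   # expected probabilities, uncertainty K/S

print(dirichlet_outputs(np.zeros(10)))          # no evidence -> uniform, u = 1.0
print(dirichlet_outputs(100 * np.eye(10)[3]))   # strong evidence for class 3 -> low u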
# This evidence function is used for the first example
def relu_evidence(logits):
    return tf.nn.relu(logits)

# This one usually works better and is used for the second and third examples.
# For general settings and different datasets, you may want to try this one first.
def exp_evidence(logits):
    return tf.exp(tf.clip_by_value(logits,-10,10))

# This one is another alternative and usually behaves better than relu_evidence.
def softplus_evidence(logits):
    return tf.nn.softplus(logits)
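To get a feel for how these choices differ, the following throwaway cell (an illustrative sketch, not part of the training pipeline) evaluates all three on the same logits:
# Compare the three evidence functions on sample logits (illustration only).
with tf.Graph().as_default(), tf.Session() as s:
    sample_logits = tf.constant([[-2.0, 0.0, 2.0, 5.0]])
    for f in (relu_evidence, softplus_evidence, exp_evidence):
        print('%-18s' % f.__name__, s.run(f(sample_logits))[0])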
def KL(alpha):
    beta = tf.constant(np.ones((1,K)), dtype=tf.float32)
    S_alpha = tf.reduce_sum(alpha, axis=1, keepdims=True)
    S_beta = tf.reduce_sum(beta, axis=1, keepdims=True)
    lnB = tf.lgamma(S_alpha) - tf.reduce_sum(tf.lgamma(alpha), axis=1, keepdims=True)
    lnB_uni = tf.reduce_sum(tf.lgamma(beta), axis=1, keepdims=True) - tf.lgamma(S_beta)
    dg0 = tf.digamma(S_alpha)
    dg1 = tf.digamma(alpha)
    kl = tf.reduce_sum((alpha - beta)*(dg1 - dg0), axis=1, keepdims=True) + lnB + lnB_uni
    return kl
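KL above is the Kullback-Leibler divergence between Dir($\alpha$) and the uniform Dirichlet Dir(1, ..., 1); in the losses below it penalizes evidence that does not support the true label. As a quick sanity check, here is a standalone NumPy/SciPy version mirroring the TF code (assuming scipy is available, as it already is in this notebook); the divergence should vanish at $\alpha = \mathbf{1}$:
from scipy.special import gammaln, digamma

def kl_dirichlet_uniform(alpha):
    # mirrors the TF computation above: KL( Dir(alpha) || Dir(1,...,1) )
    S = alpha.sum()
    lnB = gammaln(S) - gammaln(alpha).sum()
    lnB_uni = -gammaln(float(len(alpha)))
    return ((alpha - 1) * (digamma(alpha) - digamma(S))).sum() + lnB + lnB_uni

print(kl_dirichlet_uniform(np.ones(10)))      # ~0.0
print(kl_dirichlet_uniform(5 * np.ones(10)))  # > 0: more evidence than Dir(1)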
def mse_loss(p, alpha, global_step, annealing_step):
    S = tf.reduce_sum(alpha, axis=1, keepdims=True)
    E = alpha - 1
    m = alpha / S
    A = tf.reduce_sum((p-m)**2, axis=1, keepdims=True)
    B = tf.reduce_sum(alpha*(S-alpha)/(S*S*(S+1)), axis=1, keepdims=True)
    annealing_coef = tf.minimum(1.0, tf.cast(global_step/annealing_step, tf.float32))
    alp = E*(1-p) + 1
    C = annealing_coef * KL(alp)
    return (A + B) + C
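Written out, mse_loss computes the expected sum of squared errors under the predicted Dirichlet (Eq. 5 in the paper) plus an annealed KL regularizer:

$$\mathcal{L}_i = \sum_{j=1}^{K}\left[(y_{ij}-\hat{p}_{ij})^2 + \frac{\hat{p}_{ij}(1-\hat{p}_{ij})}{S_i+1}\right] + \lambda_t\, \mathrm{KL}\big[\mathrm{Dir}(\tilde{\alpha}_i)\,\|\,\mathrm{Dir}(\mathbf{1})\big]$$

where $\hat{p}_{ij} = \alpha_{ij}/S_i$ (the A and B terms above), $\lambda_t = \min(1, t/\texttt{annealing\_step})$ is the annealing coefficient, and $\tilde{\alpha}_i = y_i + (1-y_i)\odot\alpha_i$ removes the evidence of the correct class, so the KL term penalizes only misleading evidence.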
# train LeNet network with expected mean square error loss
def LeNet_EDL(logits2evidence=relu_evidence, loss_function=mse_loss, lmb=0.005):
    g = tf.Graph()
    with g.as_default():
        X = tf.placeholder(shape=[None,28*28], dtype=tf.float32)
        Y = tf.placeholder(shape=[None,10], dtype=tf.float32)
        keep_prob = tf.placeholder(dtype=tf.float32)
        global_step = tf.Variable(initial_value=0, name='global_step', trainable=False)
        annealing_step = tf.placeholder(dtype=tf.int32)
        # first hidden layer - conv
        W1 = var('W1', [5,5,1,20])
        b1 = var('b1', [20])
        out1 = max_pool(tf.nn.relu(conv(tf.reshape(X, [-1, 28, 28, 1]),
                                        W1, strides=[1, 1, 1, 1]) + b1))
        # second hidden layer - conv
        W2 = var('W2', [5,5,20,50])
        b2 = var('b2', [50])
        out2 = max_pool(tf.nn.relu(conv(out1, W2, strides=[1, 1, 1, 1]) + b2))
        # flatten the output
        Xflat = tf.contrib.layers.flatten(out2)
        # third hidden layer - fully connected
        W3 = var('W3', [Xflat.get_shape()[1].value, 500])
        b3 = var('b3', [500])
        out3 = tf.nn.relu(tf.matmul(Xflat, W3) + b3)
        out3 = tf.nn.dropout(out3, keep_prob=keep_prob)
        # output layer
        W4 = var('W4', [500,10])
        b4 = var('b4', [10])
        logits = tf.matmul(out3, W4) + b4

        evidence = logits2evidence(logits)
        alpha = evidence + 1
        u = K / tf.reduce_sum(alpha, axis=1, keepdims=True)  # uncertainty
        prob = alpha / tf.reduce_sum(alpha, 1, keepdims=True)

        loss = tf.reduce_mean(loss_function(Y, alpha, global_step, annealing_step))
        l2_loss = (tf.nn.l2_loss(W3) + tf.nn.l2_loss(W4)) * lmb
        step = tf.train.AdamOptimizer().minimize(loss + l2_loss, global_step=global_step)

        # calculate accuracy
        pred = tf.argmax(logits, 1)
        truth = tf.argmax(Y, 1)
        match = tf.reshape(tf.cast(tf.equal(pred, truth), tf.float32), (-1,1))
        acc = tf.reduce_mean(match)

        total_evidence = tf.reduce_sum(evidence, 1, keepdims=True)
        mean_ev = tf.reduce_mean(total_evidence)
        mean_ev_succ = tf.reduce_sum(total_evidence * match) / tf.reduce_sum(match + 1e-20)
        mean_ev_fail = tf.reduce_sum(total_evidence * (1 - match)) / (tf.reduce_sum(tf.abs(1 - match)) + 1e-20)
    return g, step, X, Y, annealing_step, keep_prob, prob, acc, loss, u, evidence, mean_ev, mean_ev_succ, mean_ev_fail
g2, step2, X2, Y2, annealing_step, keep_prob2, prob2, acc2, loss2, u, evidence, \
    mean_ev, mean_ev_succ, mean_ev_fail = LeNet_EDL()
sess2 = tf.Session(graph=g2)
with g2.as_default():
    sess2.run(tf.global_variables_initializer())

bsize = 1000  # batch size
n_batches = mnist.train.num_examples // bsize
L_train_acc1 = []
L_train_ev_s = []
L_train_ev_f = []
L_test_acc1 = []
L_test_ev_s = []
L_test_ev_f = []
for epoch in range(50):
    for i in range(n_batches):
        data, label = mnist.train.next_batch(bsize)
        feed_dict = {X2:data, Y2:label, keep_prob2:.5, annealing_step:10*n_batches}
        sess2.run(step2, feed_dict)
        print('epoch %d - %d%% ' % (epoch+1, (100*(i+1))//n_batches), end='\r' if i < n_batches-1 else '')
    train_acc, train_succ, train_fail = sess2.run([acc2,mean_ev_succ,mean_ev_fail], feed_dict={X2:mnist.train.images, Y2:mnist.train.labels, keep_prob2:1.})
    test_acc, test_succ, test_fail = sess2.run([acc2,mean_ev_succ,mean_ev_fail], feed_dict={X2:mnist.test.images, Y2:mnist.test.labels, keep_prob2:1.})
    L_train_acc1.append(train_acc)
    L_train_ev_s.append(train_succ)
    L_train_ev_f.append(train_fail)
    L_test_acc1.append(test_acc)
    L_test_ev_s.append(test_succ)
    L_test_ev_f.append(test_fail)
    print('training: %2.4f (%2.4f - %2.4f) \t testing: %2.4f (%2.4f - %2.4f)' %
          (train_acc, train_succ, train_fail, test_acc, test_succ, test_fail))
The following function plots the average total evidence and the prediction uncertainty, in addition to the accuracy, for the training and test sets. Note that uncertainty approaches 1.0 as the total evidence approaches 0.
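This relationship follows directly from $u = K/S = K/(K + \text{total evidence})$, which is easy to verify:
# Uncertainty as a function of total evidence (K = 10 classes).
total_ev = np.array([0., 1., 10., 100., 1000.])
print(np.round(K / (K + total_ev), 3))  # -> 1.0, 0.909, 0.5, 0.091, 0.01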
def draw_EDL_results(train_acc1, train_ev_s, train_ev_f, test_acc1, test_ev_s, test_ev_f):
    # calculate uncertainty for correctly and misclassified samples on training and test data
    train_u_succ = K / (K + np.array(train_ev_s))
    train_u_fail = K / (K + np.array(train_ev_f))
    test_u_succ = K / (K + np.array(test_ev_s))
    test_u_fail = K / (K + np.array(test_ev_f))

    f, axs = pl.subplots(2, 2)
    f.set_size_inches([10, 10])

    axs[0,0].plot(train_ev_s, c='r', marker='+')
    axs[0,0].plot(train_ev_f, c='k', marker='x')
    axs[0,0].set_title('Train Data')
    axs[0,0].set_xlabel('Epoch')
    axs[0,0].set_ylabel('Estimated total evidence for classification')
    axs[0,0].legend(['Correct Classifications', 'Misclassifications'])

    axs[0,1].plot(train_u_succ, c='r', marker='+')
    axs[0,1].plot(train_u_fail, c='k', marker='x')
    axs[0,1].plot(train_acc1, c='blue', marker='*')
    axs[0,1].set_title('Train Data')
    axs[0,1].set_xlabel('Epoch')
    axs[0,1].set_ylabel('Estimated uncertainty for classification')
    axs[0,1].legend(['Correct Classifications', 'Misclassifications', 'Accuracy'])

    axs[1,0].plot(test_ev_s, c='r', marker='+')
    axs[1,0].plot(test_ev_f, c='k', marker='x')
    axs[1,0].set_title('Test Data')
    axs[1,0].set_xlabel('Epoch')
    axs[1,0].set_ylabel('Estimated total evidence for classification')
    axs[1,0].legend(['Correct Classifications', 'Misclassifications'])

    axs[1,1].plot(test_u_succ, c='r', marker='+')
    axs[1,1].plot(test_u_fail, c='k', marker='x')
    axs[1,1].plot(test_acc1, c='blue', marker='*')
    axs[1,1].set_title('Test Data')
    axs[1,1].set_xlabel('Epoch')
    axs[1,1].set_ylabel('Estimated uncertainty for classification')
    axs[1,1].legend(['Correct Classifications', 'Misclassifications', 'Accuracy'])

    plt.show()
draw_EDL_results(L_train_acc1, L_train_ev_s, L_train_ev_f, L_test_acc1, L_test_ev_s, L_test_ev_f)
The figure above indicates that the proposed approach generates a much smaller amount of evidence for misclassified samples than for correctly classified ones. The uncertainty of the misclassified samples is around 0.8, while it is around 0.1 for the correctly classified ones, on both the training and test sets. This means the neural network is very uncertain about the misclassified samples and provides confident predictions only for the correctly classified ones. In other words, the neural network also predicts when it fails, by assigning high uncertainty to its wrong predictions.
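One practical consequence is that uncertainty can drive an abstention rule. The sketch below is a minimal illustration; the 0.5 threshold is a hypothetical, application-dependent choice, not prescribed by the paper:
# A minimal sketch: abstain when predictive uncertainty exceeds a threshold.
def predict_or_abstain(images, threshold=0.5):  # threshold is an illustrative choice
    p, unc = sess2.run([prob2, u], feed_dict={X2: images, keep_prob2: 1.0})
    preds = p.argmax(axis=1)
    preds[unc[:, 0] > threshold] = -1  # -1 marks "I do not know"
    return preds, unc[:, 0]

preds, unc = predict_or_abstain(mnist.test.images[:5])
print(preds, np.round(unc, 2))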
rotating_image_classification(digit_one, sess2, prob2, X2, keep_prob2, u)
In this section, we train the neural network using the loss function described in Eq. 4 of the paper. This loss function is the expected value of the cross-entropy loss over the predicted Dirichlet distribution.
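For a one-hot label $y_i$ and Dirichlet parameters $\alpha_i$ with strength $S_i = \sum_j \alpha_{ij}$, this expected cross entropy has the closed form

$$\mathcal{L}_i = \sum_{j=1}^{K} y_{ij}\left(\psi(S_i) - \psi(\alpha_{ij})\right),$$

where $\psi$ is the digamma function. The A term of loss_func below implements exactly this, combined with the same annealed KL regularizer as before.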
def loss_EDL(func=tf.digamma):
    def loss_func(p, alpha, global_step, annealing_step):
        S = tf.reduce_sum(alpha, axis=1, keepdims=True)
        E = alpha - 1
        A = tf.reduce_sum(p * (func(S) - func(alpha)), axis=1, keepdims=True)
        annealing_coef = tf.minimum(1.0, tf.cast(global_step/annealing_step, tf.float32))
        alp = E*(1-p) + 1
        B = annealing_coef * KL(alp)
        return (A + B)
    return loss_func
g3, step3, X3, Y3, annealing_step3, keep_prob3, prob3, acc3, loss3, u3, evidence3, \
    mean_ev3, mean_ev_succ3, mean_ev_fail3 = LeNet_EDL(exp_evidence, loss_EDL(tf.digamma), lmb=0.001)
sess3 = tf.Session(graph=g3)
with g3.as_default():
    sess3.run(tf.global_variables_initializer())

bsize = 1000  # batch size
n_batches = mnist.train.num_examples // bsize
L3_train_acc1 = []
L3_train_ev_s = []
L3_train_ev_f = []
L3_test_acc1 = []
L3_test_ev_s = []
L3_test_ev_f = []
for epoch in range(50):
    for i in range(n_batches):
        data, label = mnist.train.next_batch(bsize)
        feed_dict = {X3:data, Y3:label, keep_prob3:.5, annealing_step3:10*n_batches}
        sess3.run(step3, feed_dict)
        print('epoch %d - %d%% ' % (epoch+1, (100*(i+1))//n_batches), end='\r' if i < n_batches-1 else '')
    train_acc, train_succ, train_fail = sess3.run([acc3,mean_ev_succ3,mean_ev_fail3], feed_dict={X3:mnist.train.images, Y3:mnist.train.labels, keep_prob3:1.})
    test_acc, test_succ, test_fail = sess3.run([acc3,mean_ev_succ3,mean_ev_fail3], feed_dict={X3:mnist.test.images, Y3:mnist.test.labels, keep_prob3:1.})
    L3_train_acc1.append(train_acc)
    L3_train_ev_s.append(train_succ)
    L3_train_ev_f.append(train_fail)
    L3_test_acc1.append(test_acc)
    L3_test_ev_s.append(test_succ)
    L3_test_ev_f.append(test_fail)
    print('training: %2.4f (%2.4f - %2.4f) \t testing: %2.4f (%2.4f - %2.4f)' %
          (train_acc, train_succ, train_fail, test_acc, test_succ, test_fail))
draw_EDL_results(L3_train_acc1, L3_train_ev_s, L3_train_ev_f, L3_test_acc1, L3_test_ev_s, L3_test_ev_f)
The figure above indicates that the neural network generates much more evidence for the correctly classified samples. As a result, its uncertainty is very low (around zero) for the correctly classified samples, while it is very high (around 0.7) for the misclassified ones.
rotating_image_classification(digit_one, sess3, prob3, X3, keep_prob3, u3)
In this section, we repeat our experiments using the loss function based on Eq. 3 in the paper.
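Replacing the digamma function with the logarithm yields the negative log of the expected probability of the true class:

$$\mathcal{L}_i = \sum_{j=1}^{K} y_{ij}\left(\log S_i - \log \alpha_{ij}\right) = -\sum_{j=1}^{K} y_{ij}\,\log\frac{\alpha_{ij}}{S_i},$$

so the same loss_EDL factory produces this loss simply by passing func=tf.log.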
g4, step4, X4, Y4, annealing_step4, keep_prob4, prob4, acc4, loss4, u4, evidence4, \
    mean_ev4, mean_ev_succ4, mean_ev_fail4 = LeNet_EDL(exp_evidence, loss_EDL(tf.log), lmb=0.001)
sess4 = tf.Session(graph=g4)
with g4.as_default():
    sess4.run(tf.global_variables_initializer())

bsize = 1000  # batch size
n_batches = mnist.train.num_examples // bsize
L4_train_acc1 = []
L4_train_ev_s = []
L4_train_ev_f = []
L4_test_acc1 = []
L4_test_ev_s = []
L4_test_ev_f = []
for epoch in range(50):
    for i in range(n_batches):
        data, label = mnist.train.next_batch(bsize)
        feed_dict = {X4:data, Y4:label, keep_prob4:.5, annealing_step4:10*n_batches}
        sess4.run(step4, feed_dict)
        print('epoch %d - %d%% ' % (epoch+1, (100*(i+1))//n_batches), end='\r' if i < n_batches-1 else '')
    train_acc, train_succ, train_fail = sess4.run([acc4,mean_ev_succ4,mean_ev_fail4], feed_dict={X4:mnist.train.images, Y4:mnist.train.labels, keep_prob4:1.})
    test_acc, test_succ, test_fail = sess4.run([acc4,mean_ev_succ4,mean_ev_fail4], feed_dict={X4:mnist.test.images, Y4:mnist.test.labels, keep_prob4:1.})
    L4_train_acc1.append(train_acc)
    L4_train_ev_s.append(train_succ)
    L4_train_ev_f.append(train_fail)
    L4_test_acc1.append(test_acc)
    L4_test_ev_s.append(test_succ)
    L4_test_ev_f.append(test_fail)
    print('training: %2.4f (%2.4f - %2.4f) \t testing: %2.4f (%2.4f - %2.4f)' %
          (train_acc, train_succ, train_fail, test_acc, test_succ, test_fail))
draw_EDL_results(L4_train_acc1, L4_train_ev_s, L4_train_ev_f, L4_test_acc1, L4_test_ev_s, L4_test_ev_f)
rotating_image_classification(digit_one, sess4, prob4, X4, keep_prob4, u4)
Consider the case where we mix two digits from the MNIST dataset and ask a classifier trained on MNIST to classify the result. For example, the following image is created by overlaying digit 0 with digit 6. The resulting image has similarities to both digits but is neither a 0 nor a 6.
im0 = mnist.test.images[10]
im6 = mnist.test.images[21]
img = im0 + im6
img /= img.max()
plt.subplot(1,3,1)
plt.imshow(im0.reshape(28,28))
plt.subplot(1,3,2)
plt.imshow(im6.reshape(28,28))
plt.subplot(1,3,3)
plt.imshow(img.reshape(28,28))
plt.show()
The neural network trained with the softmax cross-entropy loss makes the following prediction for this image, classifying it as 0 with probability 0.9.
p1 = sess1.run(prob1, feed_dict={X1:img[None,:], keep_prob1:1.0})
print('softmax prob: ', np.round(p1[0], decimals=3))
When we run the same experiment on the neural network trained using the loss function in Eq. 5, we get quite different results. The network cannot generate any evidence to classify the image as one of the 10 digits; hence, it outputs a uniform distribution as its prediction. It effectively says "I do not know" by reporting maximum uncertainty.
uncertainty2, p2 = sess2.run([u, prob2], feed_dict={X2:img[None,:], keep_prob2:1.0})
print('uncertainty:', np.round(uncertainty2[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p2[0], decimals=3))
When we use the loss function in Eq. 4, the expected probability is highest for digit 0, at around 0.32; however, the associated uncertainty is quite high, around 0.73, as shown below.
uncertainty3, p3 = sess3.run([u3, prob3], feed_dict={X3:img[None,:], keep_prob3:1.0})
print('uncertainty:', np.round(uncertainty3[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p3[0], decimals=3))
The uncertainty increases to 0.85, while the expected probability for digit 0 decreases to 0.184, when the loss function in Eq. 3 is used.
uncertainty4, p4 = sess4.run([u4, prob4], feed_dict={X4:img[None,:], keep_prob4:1.0})
print('uncertainty:', np.round(uncertainty4[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p4[0], decimals=3))
Let's try another setting, in which each of the two digits is easily recognizable. Below, you can see an image created by combining the images of digit 0 and digit 6 without any overlap.
img = np.zeros((28,28))
img[:,:-6] += mnist.test.images[10].reshape(28,28)[:,6:]
img[:,14:] += mnist.test.images[21].reshape(28,28)[:,5:19]
img /= img.max()
plt.imshow(img)
plt.show()
Below, you can see the prediction of the neural network trained with the softmax cross-entropy loss for this example. The network predicts digit 2 with probability 0.775; hence, it associates quite a high probability with the wrong label.
p1 = sess1.run(prob1, feed_dict={X1:img.reshape(1,-1), keep_prob1:1.0})
print('softmax prob: ', np.round(p1[0], decimals=3))
On the other hand, when we do the same using the network trained with the loss in Eq. 5, the output of the neural network is a uniform distribution with uncertainty 1.0, as shown below.
uncertainty2, p2 = sess2.run([u, prob2], feed_dict={X2:img.reshape(1,-1), keep_prob2:1.0})
print('uncertainty:', np.round(uncertainty2[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p2[0], decimals=3))
The neural networks trained using the loss functions defined in Eq. 4 and Eq. 3 of the paper also have very high uncertainty in their predictions. These networks assign a small amount of evidence to classifying the image as digit 2; however, they associate very high uncertainty with this misclassification.
uncertainty3, p3 = sess3.run([u3, prob3], feed_dict={X3:img.reshape(1,-1), keep_prob3:1.0})
print('uncertainty:', np.round(uncertainty3[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p3[0], decimals=3))
uncertainty4, p4 = sess4.run([u4, prob4], feed_dict={X4:img.reshape(1,-1), keep_prob4:1.0})
print('uncertainty:', np.round(uncertainty4[0,0], decimals=2))
print('Dirichlet mean: ', np.round(p4[0], decimals=3))