CNN - visualization of maximum filter activation using TensorFlow (using MNIST)

I am currently working on rendering the maximally activating input image for each kernel/filter learned by a convolutional neural network.

The Keras blog has a post here that does something similar, but the results were dubious at best when I used anything other than the supplied dataset, so I thought I would try it in TensorFlow directly. [I will try to edit my post later with images from that attempt; they are not available on this computer.]

Using the MNIST dataset, with the TensorFlow tutorial and the Keras blog post as references, I wrote the following code to generate these visualizations. I'm not sure my methodology is correct, especially regarding how and when to normalize the results in order to visualize them.

import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
from scipy.misc import imsave  #note: removed in SciPy >= 1.2; imageio.imwrite is the modern replacement


#~~~~~~~~~~~~~~~~~~~~~~~~~ CNN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Most of the CNN section directly from the tutorial
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
img_width = 28
img_height = 28
n = 3                      #grid dimension of the stitched output image (n x n)
remove_negatives = False   #zero out negative pixels before display
normalize = True           #standardize instead of min-max rescaling to 0-255
use = 'layer'              #'layer' visualizes conv layer 1; anything else uses layer 2


def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=0.1)
  return(tf.Variable(initial))

def bias_variable(shape):
  initial = tf.constant(0.1, shape=shape)
  return(tf.Variable(initial))

def conv2d(x, W):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='SAME')

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

x_image = tf.reshape(x, [-1,28,28,1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


with tf.Session() as sess:
   sess.run(tf.global_variables_initializer())
   for i in range(5000):
     batch = mnist.train.next_batch(50)
     if i%100 == 0:
       train_accuracy = accuracy.eval(feed_dict={x:batch[0], y_: batch[1], keep_prob: 1.0})
       print("step %d, training accuracy %g"%(i, train_accuracy))
     train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
   #pull the trained weights and biases out as numpy arrays for the visualization step
   layer = sess.run(W_conv1)
   bias = sess.run(b_conv1)
   layer2 = sess.run(W_conv2)
   bias2 = sess.run(b_conv2)





#~~~~~~~~~~~~~~~ Begin Visualization Code ~~~~~~~~~~~~~~~~
kept_filters = []
layer_use = layer
bias_use = bias
k = 1  #number of input channels for the kernels being visualized
#toggle between layer 1 and layer 2 based on the 'use' variable defined at the top
if use != 'layer':
   k = layer2.shape[2]  #layer 2 kernels take 32 input channels
   layer_use = layer2
   bias_use = bias2

#loop through the kernels/feature maps and synthesize a maximizing input image for each
for fmap in range(layer_use.shape[3]):
   #randomized white-noise input image that will be optimized
   noise_mat = weight_variable([1,28,28,k])
   #load this feature map's kernel as a constant; the bias is added after the convolution
   single_layer = tf.constant(layer_use[:,:,0:k,fmap:fmap+1], dtype=tf.float32)
   conv = conv2d(noise_mat, single_layer) + bias_use[fmap]
   #Use the mean of the feature map as the "loss" (negated, since we want to maximize) - is this the proper way to do this?
   loss = -tf.reduce_mean(conv)
   train_step = tf.train.GradientDescentOptimizer(.5).minimize(loss, var_list=[noise_mat])

   #the training/maximizing
   with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      updatelist = [np.sum(sess.run(noise_mat)[0,:,:,0])]
      noise_mat_begin = sess.run(noise_mat[0,:,:,0])
      conv_saved = sess.run(conv)
      for step in range(5000):
         train_step.run()
         if step % 200 == 0:
            updatelist.append(np.sum(sess.run(noise_mat)[0,:,:,0]))
      noise_mat_end = sess.run(noise_mat)[0,:,:,0]
   noise_mat_normed = noise_mat_end.copy()

   #not sure the best way to normalize?
   if remove_negatives:
      noise_mat_normed[noise_mat_normed <= 0] = 0
   if normalize:
      #standardize to zero mean and unit variance
      noise_mat_normed = (noise_mat_normed - np.mean(noise_mat_normed)) / np.std(noise_mat_normed)
   else:
      #linearly rescale into the 0-255 pixel range
      oldmin = np.min(noise_mat_normed)
      oldmax = np.max(noise_mat_normed)
      noise_mat_normed = (noise_mat_normed - oldmin) * 255 / (oldmax - oldmin)
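   #(untested alternative adapted from the blog post's deprocess_image:
   # standardize, shrink the contrast to std 0.1 around 0.5, then clip to the
   # valid pixel range --
   #   x = (noise_mat_normed - np.mean(noise_mat_normed)) / (np.std(noise_mat_normed) + 1e-5)
   #   noise_mat_normed = np.clip(x * 0.1 + 0.5, 0, 1) * 255
   #)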

   #negative sums generally imply a lack of convergence due to my loss metric, so remove them   
   if np.sum(noise_mat_normed) > 0:
      kept_filters += [noise_mat_normed]


#visualize results in a grid format, similar to the blog post
kept_filters = kept_filters[:n * n]
margin = 5
width = n * img_width + (n - 1) * margin
height = n * img_height + (n - 1) * margin
stitched_filters = np.zeros((width, height))

for i in range(n):
    for j in range(n):
        img = kept_filters[i * n + j]
        stitched_filters[(img_width + margin) * i: (img_width + margin) * i + img_width,
                         (img_height + margin) * j: (img_height + margin) * j + img_height] = img

imsave('TF_vis_%dx%d.png' % (n, n), stitched_filters)
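
For reference, here is the blog post's update rule expressed in raw TensorFlow, which I considered as an alternative to GradientDescentOptimizer - the gradient is normalized before every ascent step. This is an untested sketch on my part (the variable names are mine, not from the tutorial):

#untested sketch: explicit normalized-gradient ascent on one layer-1 feature map
kernel = tf.constant(layer[:, :, :, 0:1], dtype=tf.float32)  #first layer-1 filter
img = tf.Variable(tf.random_normal([1, 28, 28, 1], stddev=0.1))
activation = tf.reduce_mean(conv2d(img, kernel) + bias[0])
grad = tf.gradients(activation, [img])[0]
grad /= tf.sqrt(tf.reduce_mean(tf.square(grad))) + 1e-5  #normalize the gradient
ascent = tf.assign_add(img, grad)

with tf.Session() as sess:
   sess.run(tf.global_variables_initializer())
   for _ in range(100):
      sess.run(ascent)
   result = sess.run(img)[0, :, :, 0]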

      

Running the full script produces results like this (from convolutional layer 1):

[image: 3x3 grid of generated filter visualizations for layer 1]

I'm not sure whether this is correct at all, especially since layer 2 does not look substantially different from layer 1. Are my results and/or methodology reasonable? Has anyone else done this with the MNIST dataset? As an aside, the validation accuracy was > 95%.

EDIT: I must have been doing something wrong; I re-ran the code from the blog post, and the results from my own TensorFlow code now look about the same as its output, so that part is fine. However, the main problems remain:

Why am I not getting more obvious or distinct results? I know they won't be as specific as the filters themselves, but these images don't seem to depict anything, unlike the visualizations in similar blog posts. Is there simply not enough variation in the underlying dataset?

Shouldn't I be getting at least SOME features that aren't just glorified edge images, such as diagonals or curves?

Shouldn't the second layer's visualizations look like more complex combinations of the first's?
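
On that last point, I wondered whether my layer-2 setup is the problem: I convolve a 32-channel noise image directly with the layer-2 kernels, rather than optimizing a single-channel input image through the full conv1 -> pool -> conv2 stack. A sketch of what I mean (untested, using the same normalized-gradient update as above):

#untested sketch: drive one layer-2 feature map through the whole first stage
img2 = tf.Variable(tf.random_normal([1, 28, 28, 1], stddev=0.1))
h1 = tf.nn.relu(conv2d(img2, tf.constant(layer)) + bias)
p1 = max_pool_2x2(h1)
h2 = conv2d(p1, tf.constant(layer2[:, :, :, 0:1])) + bias2[0]  #first layer-2 map
grad2 = tf.gradients(tf.reduce_mean(h2), [img2])[0]
grad2 /= tf.sqrt(tf.reduce_mean(tf.square(grad2))) + 1e-5
ascent2 = tf.assign_add(img2, grad2)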
