What is This?

VAE(とAE)の中間層の状態を見てみたいなと思い、可視化してgifにしました.
以前からVAEでmnistの複数の数字を入れたらわかりにくいと感じていたので、1文字で学習させています.

可視化コードを加えただけですが「コード」のタブにソースコードを置いておきました. 使いまわせる方がいると幸いです.

gif化にはImageMagickをインストールし、以下のコマンドを実行します.

convert -delay 10 -loop 0 frame_*.png ae.gif

下記では、全て中間層(隠れ層)を2次元にしています.

VAE Visualize Results

VAE with 10 digits

まずは通常の10種類のmnistの学習です.少しずつ真ん中に全体の山が寄って行きます.
可視化に使用している画像はテストデータで学習には使用していません.

VAE with 1 digit

次に数字の3の学習だけを行い、テスト画像にも3の数字だけを入力しています.
こころなしか周辺にあまりお目にかからない崩れた3が来ていて、真ん中にはよくみる3が来ているように見えなくもない...
全体的にガウス分布になろうとしている感じがあります.

VAE with 1 digit & other digits

次に数字の3の学習だけを行い、テスト画像には3の数字と他の数字も入力として入れてみています.
私の期待では,3以外の数字は全て外側に来て内側は3しかない状態になることを思い描いていましたが
やはり潜在変数が2次元と少なすぎるせいか、そこまで上手くは行きませんでした...

2とか6とかがやや侵入してきています.

AE Visualize Results

AE with 10 digits

まずは通常の10種類のmnistの学習です.0~1の間に収まっているのはsigmoidのおかげです.
ReLUにしているものをあまり見かけなかったので、このままにしています.

AE with 1 digit (FAIL)

次に3だけにしましたが、どうも上手く学習されなくなった気がします.
原因はまだ調べているところです.特にできなくなる理由は見つからない気がするのですが...

コードの大部分はaymericdamienさんのコードをベースにしています. Gist等も含めてこちら → UrusuLambda's Github

Code (VAE just visualize 2d hidden layer. Multiple Digits)

10種類のMNISTをVAEで学習させているときのコードです.元のコードに可視化用のコードを加えて、中間層から値を取得して2Dにプロットしています.

#!/usr/bin/env python
from __future__ import division, print_function, absolute_import
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import tensorflow as tf
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
AnnotationBbox)
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST from /tmp/data/; one_hot=True makes every label a one-hot vector
# (the plotting code later recovers the digit with argmax).
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Optimisation settings
learning_rate = 1e-3
num_steps = 60000
batch_size = 64

# Layer sizes
image_dim = 28 * 28  # MNIST images are 28x28 pixels, fed in flattened
hidden_dim = 512
latent_dim = 2  # two latent dimensions so the codes can be drawn on a plane
# A custom initialization (see Xavier Glorot init)
def glorot_init(shape):
    """Return a random-normal initial value tensor of the given shape.

    The standard deviation is ``1 / sqrt(shape[0] / 2)``, so it depends only
    on the first entry of ``shape``.

    Args:
        shape: list of ints; ``shape[0]`` sets the scale of the noise.

    Returns:
        The tensor produced by ``tf.random_normal`` for that shape.
    """
    first_dim = shape[0]
    stddev = 1. / tf.sqrt(first_dim / 2.)
    return tf.random_normal(shape=shape, stddev=stddev)
# Trainable parameters. Each entry is created in the listed order from its
# shape, using glorot_init for the initial value.
weights = {
    layer: tf.Variable(glorot_init(shape))
    for layer, shape in [
        ('encoder_h1', [image_dim, hidden_dim]),
        ('z_mean', [hidden_dim, latent_dim]),
        ('z_std', [hidden_dim, latent_dim]),
        ('decoder_h1', [latent_dim, hidden_dim]),
        ('decoder_out', [hidden_dim, image_dim]),
    ]
}
biases = {
    layer: tf.Variable(glorot_init(shape))
    for layer, shape in [
        ('encoder_b1', [hidden_dim]),
        ('z_mean', [latent_dim]),
        ('z_std', [latent_dim]),
        ('decoder_b1', [hidden_dim]),
        ('decoder_out', [image_dim]),
    ]
}

# Encoder: one tanh hidden layer followed by two linear heads.
input_image = tf.placeholder(tf.float32, shape=[None, image_dim])
encoder = tf.nn.tanh(
    tf.matmul(input_image, weights['encoder_h1']) + biases['encoder_b1'])
z_mean = tf.matmul(encoder, weights['z_mean']) + biases['z_mean']
# Despite its name, z_std is used as a log-variance below: the sampler scales
# the noise by exp(z_std / 2).
z_std = tf.matmul(encoder, weights['z_std']) + biases['z_std']

# Sampler: z = mean + sigma * eps with eps drawn from a standard normal.
eps = tf.random_normal(tf.shape(z_std), mean=0., stddev=1.0,
                       dtype=tf.float32, name='epsilon')
z = z_mean + tf.exp(z_std / 2) * eps

# Decoder: tanh hidden layer, then a sigmoid output with one value per pixel.
decoder = tf.nn.tanh(
    tf.matmul(z, weights['decoder_h1']) + biases['decoder_b1'])
decoder = tf.nn.sigmoid(
    tf.matmul(decoder, weights['decoder_out']) + biases['decoder_out'])
# Define VAE Loss
def vae_loss(x_reconstructed, x_true):
    """Return the batch-mean VAE loss (reconstruction + KL term).

    Args:
        x_reconstructed: decoder output, one row per example.
        x_true: the images the decoder output is compared against.

    Returns:
        A scalar tensor: the mean over the batch of the per-example
        cross-entropy reconstruction loss plus the KL term.

    The KL term is computed from the module-level ``z_mean`` and ``z_std``
    tensors, with ``z_std`` entering the formula as a log-variance.
    """
    # Per-pixel Bernoulli log-likelihood; 1e-10 keeps log() away from zero.
    log_likelihood = (x_true * tf.log(1e-10 + x_reconstructed)
                      + (1 - x_true) * tf.log(1e-10 + 1 - x_reconstructed))
    reconstruction = -tf.reduce_sum(log_likelihood, 1)
    # KL divergence term, summed over the latent dimensions.
    kl_terms = 1 + z_std - tf.square(z_mean) - tf.exp(z_std)
    kl_divergence = -0.5 * tf.reduce_sum(kl_terms, 1)
    return tf.reduce_mean(reconstruction + kl_divergence)
import os

loss_op = vae_loss(decoder, input_image)
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Visualization settings
plt.ion()
testnums = 200  # number of test images embedded in every frame
images = mnist.test.images
labels = mnist.test.labels
index = 0  # number of the next frame file to write
# Border colour per digit class, indexed by label.
colors = ["red", "sienna", "gold", "olivedrab", "darkgreen", "paleturquoise",
          "deepskyblue", "navy", "m", "mediumvioletred"]
frame_dir = "vaimgs"
# savefig does not create missing directories; create it before training so
# the first frame (step 200) cannot fail on a missing folder.
if not os.path.isdir(frame_dir):
    os.makedirs(frame_dir)

# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    # One figure is reused for all frames. Creating a new figure per frame
    # without closing it keeps every figure alive until the script ends.
    fig = plt.figure(figsize=(8, 8))
    for i in range(1, num_steps + 1):
        # Get the next batch of MNIST data (only images are needed, not labels)
        batch_x, _ = mnist.train.next_batch(batch_size)
        # Train
        _, l = sess.run([train_op, loss_op],
                        feed_dict={input_image: batch_x})
        if i % 1000 == 0 or i == 1:
            print('Step %i, Loss: %f' % (i, l))
        if i % 200 == 0:
            # Latent means of held-out test images (never used for training).
            results = sess.run(z_mean,
                               feed_dict={input_image: images[:testnums]})
            fig.clf()
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlim([-4, 4])
            ax.set_ylim([-4, 4])
            for j, point in enumerate(results):
                # Draw the digit thumbnail at its latent position, framed in
                # the colour of its class.
                imb0 = OffsetImage(images[j].reshape(28, 28), cmap='gray',
                                   zoom=1.6)
                imb0.image.axes = ax
                label = labels[j].argmax()
                ab0 = AnnotationBbox(
                    imb0, point, xycoords="data", boxcoords="offset points",
                    pad=0, frameon=True,
                    bboxprops={"linewidth": 5, "edgecolor": colors[label]})
                ax.add_artist(ab0)
            fig.savefig(os.path.join(frame_dir, "frame_%04d.png" % index))
            index += 1
    plt.close(fig)

    # Testing
    # Generator takes noise as input
    noise_input = tf.placeholder(tf.float32, shape=[None, latent_dim])
    # Rebuild the decoder on top of the trained weights so that it maps a
    # latent point directly to an image.
    decoder = tf.matmul(noise_input, weights['decoder_h1']) + biases['decoder_b1']
    decoder = tf.nn.tanh(decoder)
    decoder = tf.matmul(decoder, weights['decoder_out']) + biases['decoder_out']
    decoder = tf.nn.sigmoid(decoder)

    # Building a manifold of generated digits
    n = 20
    x_axis = np.linspace(-3, 3, n)
    y_axis = np.linspace(-3, 3, n)
    # Decode the whole n*n grid in one run. Row i * n + j of the result is the
    # image for (x_axis[j], y_axis[i]).
    latent_grid = np.array([[xi, yi] for yi in y_axis for xi in x_axis])
    x_mean = sess.run(decoder, feed_dict={noise_input: latent_grid})
    canvas = np.empty((28 * n, 28 * n))
    for i in range(n):
        for j in range(n):
            # Row blocks are filled bottom-up so larger y values end up
            # higher in the image.
            canvas[(n - i - 1) * 28:(n - i) * 28, j * 28:(j + 1) * 28] = \
                x_mean[i * n + j].reshape(28, 28)

    # Leave interactive mode so that show() blocks until the window is closed;
    # in interactive mode it returns immediately and the script would exit.
    plt.ioff()
    plt.figure(figsize=(8, 10))
    plt.imshow(canvas, origin="upper", cmap="gray")
    plt.show()

Code (VAE just visualize 2d hidden layer. Single Digits)

1種類のMNISTをVAEで学習させているときのコードです.(大して一つめと変わりません)

#!/usr/bin/env python
from __future__ import division, print_function, absolute_import
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import tensorflow as tf
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
AnnotationBbox)
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST from /tmp/data/. one_hot is not requested here, so labels are
# plain digit ids that can be compared with == further down.
mnist = input_data.read_data_sets("/tmp/data/")

# Optimisation settings
learning_rate = 1e-3
num_steps = 60000
batch_size = 64

# Layer sizes
image_dim = 28 * 28  # MNIST images are 28x28 pixels, fed in flattened
hidden_dim = 512
latent_dim = 2  # two latent dimensions so the codes can be drawn on a plane
# A custom initialization (see Xavier Glorot init)
def glorot_init(shape):
    """Return a random-normal initial value tensor of the given shape.

    The standard deviation is ``1 / sqrt(shape[0] / 2)``, so it depends only
    on the first entry of ``shape``.

    Args:
        shape: list of ints; ``shape[0]`` sets the scale of the noise.

    Returns:
        The tensor produced by ``tf.random_normal`` for that shape.
    """
    first_dim = shape[0]
    stddev = 1. / tf.sqrt(first_dim / 2.)
    return tf.random_normal(shape=shape, stddev=stddev)
# Trainable parameters. Each entry is created in the listed order from its
# shape, using glorot_init for the initial value.
weights = {
    layer: tf.Variable(glorot_init(shape))
    for layer, shape in [
        ('encoder_h1', [image_dim, hidden_dim]),
        ('z_mean', [hidden_dim, latent_dim]),
        ('z_std', [hidden_dim, latent_dim]),
        ('decoder_h1', [latent_dim, hidden_dim]),
        ('decoder_out', [hidden_dim, image_dim]),
    ]
}
biases = {
    layer: tf.Variable(glorot_init(shape))
    for layer, shape in [
        ('encoder_b1', [hidden_dim]),
        ('z_mean', [latent_dim]),
        ('z_std', [latent_dim]),
        ('decoder_b1', [hidden_dim]),
        ('decoder_out', [image_dim]),
    ]
}

# Encoder: one tanh hidden layer followed by two linear heads.
input_image = tf.placeholder(tf.float32, shape=[None, image_dim])
encoder = tf.nn.tanh(
    tf.matmul(input_image, weights['encoder_h1']) + biases['encoder_b1'])
z_mean = tf.matmul(encoder, weights['z_mean']) + biases['z_mean']
# Despite its name, z_std is used as a log-variance below: the sampler scales
# the noise by exp(z_std / 2).
z_std = tf.matmul(encoder, weights['z_std']) + biases['z_std']

# Sampler: z = mean + sigma * eps with eps drawn from a standard normal.
eps = tf.random_normal(tf.shape(z_std), mean=0., stddev=1.0,
                       dtype=tf.float32, name='epsilon')
z = z_mean + tf.exp(z_std / 2) * eps

# Decoder: tanh hidden layer, then a sigmoid output with one value per pixel.
decoder = tf.nn.tanh(
    tf.matmul(z, weights['decoder_h1']) + biases['decoder_b1'])
decoder = tf.nn.sigmoid(
    tf.matmul(decoder, weights['decoder_out']) + biases['decoder_out'])
# Define VAE Loss
def vae_loss(x_reconstructed, x_true):
    """Return the batch-mean VAE loss (reconstruction + KL term).

    Args:
        x_reconstructed: decoder output, one row per example.
        x_true: the images the decoder output is compared against.

    Returns:
        A scalar tensor: the mean over the batch of the per-example
        cross-entropy reconstruction loss plus the KL term.

    The KL term is computed from the module-level ``z_mean`` and ``z_std``
    tensors, with ``z_std`` entering the formula as a log-variance.
    """
    # Per-pixel Bernoulli log-likelihood; 1e-10 keeps log() away from zero.
    log_likelihood = (x_true * tf.log(1e-10 + x_reconstructed)
                      + (1 - x_true) * tf.log(1e-10 + 1 - x_reconstructed))
    reconstruction = -tf.reduce_sum(log_likelihood, 1)
    # KL divergence term, summed over the latent dimensions.
    kl_terms = 1 + z_std - tf.square(z_mean) - tf.exp(z_std)
    kl_divergence = -0.5 * tf.reduce_sum(kl_terms, 1)
    return tf.reduce_mean(reconstruction + kl_divergence)
import os

loss_op = vae_loss(decoder, input_image)
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Visualization settings
plt.ion()
testnums = 200  # number of target-digit test images embedded in every frame
teimages = mnist.test.images
telabels = mnist.test.labels
# Split the test set by digit so that the plotted mix can be chosen freely.
test_images = {}
test_labels = {}
for i in range(0, 10):
    test_idx = telabels == i
    test_images[i] = teimages[test_idx]
    test_labels[i] = telabels[test_idx]
index = 0  # number of the next frame file to write
target_index = 3  # the only digit the model is trained on
# Plot up to `testnums` images of the target digit plus five images of every
# other digit, to see where unseen digits land in the latent space.
target_test_images = test_images[target_index][:testnums]
target_test_labels = test_labels[target_index][:testnums]
for i in range(0, 10):
    if i != target_index:
        target_test_images = np.append(target_test_images, test_images[i][:5], axis=0)
        target_test_labels = np.append(target_test_labels, test_labels[i][:5], axis=0)
# Border colour per digit class, indexed by label.
colors = ["red", "sienna", "gold", "olivedrab", "darkgreen", "paleturquoise",
          "deepskyblue", "navy", "m", "mediumvioletred"]
frame_dir = "svaimgs"
# savefig does not create missing directories; create it before training so
# the first frame (step 200) cannot fail on a missing folder.
if not os.path.isdir(frame_dir):
    os.makedirs(frame_dir)

# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    # One figure is reused for all frames. Creating a new figure per frame
    # without closing it keeps every figure alive until the script ends.
    fig = plt.figure(figsize=(8, 8))
    for i in range(1, num_steps + 1):
        # Keep only the target digit from each mixed batch. A batch may hold
        # no such image at all, in which case the loss would be a mean over
        # zero examples, so draw again until at least one image remains.
        while True:
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            batch_x = batch_x[batch_y == target_index]
            if len(batch_x) > 0:
                break
        # Train
        _, l = sess.run([train_op, loss_op],
                        feed_dict={input_image: batch_x})
        if i % 1000 == 0 or i == 1:
            print('Step %i, Loss: %f' % (i, l))
        if i % 200 == 0:
            results = sess.run(z_mean,
                               feed_dict={input_image: target_test_images})
            fig.clf()
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlim([-4, 4])
            ax.set_ylim([-4, 4])
            # Iterate over what was actually encoded instead of a hard-coded
            # count, so changing testnums or the extra digits stays in sync.
            for j, point in enumerate(results):
                imb0 = OffsetImage(target_test_images[j].reshape(28, 28),
                                   cmap='gray', zoom=1.6)
                imb0.image.axes = ax
                ab0 = AnnotationBbox(
                    imb0, point, xycoords="data", boxcoords="offset points",
                    pad=0, frameon=True,
                    bboxprops={"linewidth": 5,
                               "edgecolor": colors[target_test_labels[j]]})
                ax.add_artist(ab0)
            fig.savefig(os.path.join(frame_dir, "frame_%04d.png" % index))
            index += 1
    plt.close(fig)

    # Testing
    # Generator takes noise as input
    noise_input = tf.placeholder(tf.float32, shape=[None, latent_dim])
    # Rebuild the decoder on top of the trained weights so that it maps a
    # latent point directly to an image.
    decoder = tf.matmul(noise_input, weights['decoder_h1']) + biases['decoder_b1']
    decoder = tf.nn.tanh(decoder)
    decoder = tf.matmul(decoder, weights['decoder_out']) + biases['decoder_out']
    decoder = tf.nn.sigmoid(decoder)

    # Building a manifold of generated digits
    n = 20
    x_axis = np.linspace(-3, 3, n)
    y_axis = np.linspace(-3, 3, n)
    # Decode the whole n*n grid in one run. Row i * n + j of the result is the
    # image for (x_axis[j], y_axis[i]).
    latent_grid = np.array([[xi, yi] for yi in y_axis for xi in x_axis])
    x_mean = sess.run(decoder, feed_dict={noise_input: latent_grid})
    canvas = np.empty((28 * n, 28 * n))
    for i in range(n):
        for j in range(n):
            # Row blocks are filled bottom-up so larger y values end up
            # higher in the image.
            canvas[(n - i - 1) * 28:(n - i) * 28, j * 28:(j + 1) * 28] = \
                x_mean[i * n + j].reshape(28, 28)

    # Leave interactive mode so that show() blocks until the window is closed;
    # in interactive mode it returns immediately and the script would exit.
    plt.ioff()
    plt.figure(figsize=(8, 10))
    plt.imshow(canvas, origin="upper", cmap="gray")
    plt.show()

Code (AE just visualize 2d hidden layer. 10 Digits)

10種類のMNISTをAEで学習させているときのコードです.(大して一つめと変わりません)

#!/usr/bin/env python
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
AnnotationBbox)
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST from /tmp/data/; one_hot=True makes every label a one-hot vector
# (the plotting code later recovers the digit with argmax).
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Training Parameters
learning_rate = 1e-2
num_steps = 60000
batch_size = 256
display_step = 1000  # print the loss every this many steps
examples_to_show = 10

# Network Parameters
num_input = 28 * 28  # MNIST data input (img shape: 28*28)
num_hidden_1 = 256  # 1st layer num features
num_hidden_2 = 2  # 2nd layer num features (the latent dim); was 128

# tf Graph input (only pictures)
X = tf.placeholder("float", [None, num_input])

# Trainable parameters, created in the listed order and initialised with
# tf.random_normal at its default settings.
weights = {
    layer: tf.Variable(tf.random_normal(shape))
    for layer, shape in [
        ('encoder_h1', [num_input, num_hidden_1]),
        ('encoder_h2', [num_hidden_1, num_hidden_2]),
        ('decoder_h1', [num_hidden_2, num_hidden_1]),
        ('decoder_h2', [num_hidden_1, num_input]),
    ]
}
biases = {
    layer: tf.Variable(tf.random_normal(shape))
    for layer, shape in [
        ('encoder_b1', [num_hidden_1]),
        ('encoder_b2', [num_hidden_2]),
        ('decoder_b1', [num_hidden_1]),
        ('decoder_b2', [num_input]),
    ]
}
# Building the encoder
def encoder(x):
    """Encode ``x`` through two sigmoid layers and return the second one.

    Uses the module-level ``weights`` / ``biases`` entries ``encoder_h1``,
    ``encoder_b1``, ``encoder_h2`` and ``encoder_b2``. Because the last
    activation is a sigmoid, every output value lies between 0 and 1.
    """
    hidden = tf.nn.sigmoid(
        tf.matmul(x, weights['encoder_h1']) + biases['encoder_b1'])
    code = tf.nn.sigmoid(
        tf.matmul(hidden, weights['encoder_h2']) + biases['encoder_b2'])
    return code
# Building the decoder
def decoder(x):
    """Decode ``x`` through two sigmoid layers and return the second one.

    Uses the module-level ``weights`` / ``biases`` entries ``decoder_h1``,
    ``decoder_b1``, ``decoder_h2`` and ``decoder_b2``. Because the last
    activation is a sigmoid, every output value lies between 0 and 1.
    """
    hidden = tf.nn.sigmoid(
        tf.matmul(x, weights['decoder_h1']) + biases['decoder_b1'])
    reconstruction = tf.nn.sigmoid(
        tf.matmul(hidden, weights['decoder_h2']) + biases['decoder_b2'])
    return reconstruction
import os

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X
# Define loss and optimizer, minimize the squared error
loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Visualization settings
plt.ion()
testnums = 200  # number of test images embedded in every frame
# Border colour per digit class, indexed by label.
colors = ["red", "sienna", "gold", "olivedrab", "darkgreen", "paleturquoise",
          "deepskyblue", "navy", "m", "mediumvioletred"]
images = mnist.test.images
labels = mnist.test.labels
index = 0  # number of the next frame file to write
frame_dir = "aeimgs"
# savefig does not create missing directories; create it before training so
# the first frame (step 200) cannot fail on a missing folder.
if not os.path.isdir(frame_dir):
    os.makedirs(frame_dir)

# Start Training
# Start a new TF session
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    # One figure is reused for all frames. Creating a new figure per frame
    # without closing it keeps every figure alive until the script ends.
    fig = plt.figure(figsize=(8, 8))
    # Training
    for i in range(1, num_steps + 1):
        # Get the next batch of MNIST data (only images are needed, not labels)
        batch_x, _ = mnist.train.next_batch(batch_size)
        # Run optimization op (backprop) and cost op (to get loss value)
        _, l = sess.run([optimizer, loss], feed_dict={X: batch_x})
        # Display logs per step
        if i % display_step == 0 or i == 1:
            print('Step %i: Minibatch Loss: %f' % (i, l))
        if i % 200 == 0:
            results = sess.run(encoder_op, feed_dict={X: images[:testnums]})
            fig.clf()
            ax = fig.add_subplot(1, 1, 1)
            # Fit the axes to the current codes with a small margin.
            ax.set_xlim([results[:, 0].min() - 0.05, results[:, 0].max() + 0.05])
            ax.set_ylim([results[:, 1].min() - 0.05, results[:, 1].max() + 0.05])
            for j, point in enumerate(results):
                # Draw the digit thumbnail at its code position, framed in
                # the colour of its class.
                imb0 = OffsetImage(images[j].reshape(28, 28), cmap='gray',
                                   zoom=1.6)
                imb0.image.axes = ax
                label = labels[j].argmax()
                ab0 = AnnotationBbox(
                    imb0, point, xycoords="data", boxcoords="offset points",
                    pad=0, frameon=True,
                    bboxprops={"linewidth": 5, "edgecolor": colors[label]})
                ax.add_artist(ab0)
            fig.savefig(os.path.join(frame_dir, "frame_%04d.png" % index))
            index += 1
    plt.close(fig)

    # Testing
    # Encode and decode images from test set and visualize their reconstruction.
    n = 4
    canvas_orig = np.empty((28 * n, 28 * n))
    canvas_recon = np.empty((28 * n, 28 * n))
    for i in range(n):
        # MNIST test set
        batch_x, _ = mnist.test.next_batch(n)
        # Encode and decode the digit image
        g = sess.run(decoder_op, feed_dict={X: batch_x})
        for j in range(n):
            # Row i of each canvas holds the j-th original / reconstruction
            # of this batch.
            canvas_orig[i * 28:(i + 1) * 28, j * 28:(j + 1) * 28] = \
                batch_x[j].reshape([28, 28])
            canvas_recon[i * 28:(i + 1) * 28, j * 28:(j + 1) * 28] = \
                g[j].reshape([28, 28])

    # Leave interactive mode so that each show() blocks until its window is
    # closed; in interactive mode it returns immediately and the script exits.
    plt.ioff()
    print("Original Images")
    plt.figure(figsize=(n, n))
    plt.imshow(canvas_orig, origin="upper", cmap="gray")
    plt.show()
    print("Reconstructed Images")
    plt.figure(figsize=(n, n))
    plt.imshow(canvas_recon, origin="upper", cmap="gray")
    plt.show()