TensorBoard 101

  1. Run the Python script below
  2. Run tensorboard --logdir=/tmp/tensorflow/mnist/logs/mnist_with_summaries


 

#%%
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


max_steps = 1000
learning_rate = 0.001
dropout = 0.9
data_dir = '/tmp/tensorflow/mnist/input_data'
log_dir = '/tmp/tensorflow/mnist/logs/mnist_with_summaries'


# Import data
mnist = input_data.read_data_sets(data_dir, one_hot=True)

sess = tf.InteractiveSession()
# Create a multilayer model.

# Input placeholders
with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, 784], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')

with tf.name_scope('input_reshape'):
    image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
    tf.summary.image('input', image_shaped_input, 10)


 

# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
    """Create a weight variable with appropriate initialization."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable with appropriate initialization."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)


 

def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
    """Reusable code for making a simple neural net layer.

    It does a matrix multiply, bias add, and then uses relu to nonlinearize.
    It also sets up name scoping so that the resultant graph is easy to read,
    and adds a number of summary ops.
    """
    # Adding a name scope ensures logical grouping of the layers in the graph.
    with tf.name_scope(layer_name):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            weights = weight_variable([input_dim, output_dim])
            variable_summaries(weights)
        with tf.name_scope('biases'):
            biases = bias_variable([output_dim])
            variable_summaries(biases)
        with tf.name_scope('Wx_plus_b'):
            preactivate = tf.matmul(input_tensor, weights) + biases
            tf.summary.histogram('pre_activations', preactivate)
        activations = act(preactivate, name='activation')
        tf.summary.histogram('activations', activations)
        return activations


 

hidden1 = nn_layer(x, 784, 500, 'layer1')

with tf.name_scope('dropout'):
    keep_prob = tf.placeholder(tf.float32)
    tf.summary.scalar('dropout_keep_probability', keep_prob)
    dropped = tf.nn.dropout(hidden1, keep_prob)

# Do not apply softmax activation yet, see below.
y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)


 

with tf.name_scope('cross_entropy'):
    # The raw formulation of cross-entropy,
    #
    # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
    #                               reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the
    # raw outputs of the nn_layer above, and then average across
    # the batch.
    diff = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
    with tf.name_scope('total'):
        cross_entropy = tf.reduce_mean(diff)
tf.summary.scalar('cross_entropy', cross_entropy)


 

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        cross_entropy)

with tf.name_scope('accuracy'):
    with tf.name_scope('correct_prediction'):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    with tf.name_scope('accuracy'):
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)


 

# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
test_writer = tf.summary.FileWriter(log_dir + '/test')
tf.global_variables_initializer().run()


 

# Train the model, and also write summaries.
# Every 10th step, measure test-set accuracy, and write test summaries
# All other steps, run train_step on training data, & add training summaries

def feed_dict(train):
    """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
    if train:
        xs, ys = mnist.train.next_batch(100)
        k = dropout
    else:
        xs, ys = mnist.test.images, mnist.test.labels
        k = 1.0
    return {x: xs, y_: ys, keep_prob: k}


 


 

saver = tf.train.Saver()
for i in range(max_steps):
    if i % 10 == 0:  # Record summaries and test-set accuracy
        summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
        test_writer.add_summary(summary, i)
        print('Accuracy at step %s: %s' % (i, acc))
    else:  # Record train set summaries, and train
        if i % 100 == 99:  # Record execution stats
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            summary, _ = sess.run([merged, train_step],
                                  feed_dict=feed_dict(True),
                                  options=run_options,
                                  run_metadata=run_metadata)
            train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
            train_writer.add_summary(summary, i)
            saver.save(sess, log_dir + "/model.ckpt", i)
            print('Adding run metadata for', i)
        else:  # Record a summary
            summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
            train_writer.add_summary(summary, i)
train_writer.close()
test_writer.close()


TensorFlow tutorials and projects

https://github.com/jtoy/awesome-tensorflow

Awesome TensorFlow

A curated list of awesome TensorFlow experiments, libraries, and projects. Inspired by awesome-machine-learning.

What is TensorFlow?

TensorFlow is an open source software library for numerical computation using data flow graphs. In other words, the best way to build deep learning models.

More info here.

Table of Contents

  • Tutorials
  • Models/Projects
  • Powered by TensorFlow
  • Libraries
  • Videos
  • Papers
  • Official announcements
  • Blog posts
  • Community
  • Books

Powered by TensorFlow

  • YOLO TensorFlow – Implementation of 'YOLO: Real-Time Object Detection'
  • android-yolo – Real-time object detection on Android using the YOLO network, powered by TensorFlow.
  • Magenta – Research project to advance the state of the art in machine intelligence for music and art generation

Books

  • Machine Learning with TensorFlow by Nishant Shukla, computer vision researcher at UCLA and author of Haskell Data Analysis Cookbook. This book makes the math-heavy topic of ML approachable and practical for a newcomer.
  • First Contact with TensorFlow by Jordi Torres, professor at UPC Barcelona Tech and a research manager and senior advisor at Barcelona Supercomputing Center
  • Deep Learning with Python – Develop Deep Learning Models on Theano and TensorFlow Using Keras, by Jason Brownlee
  • TensorFlow for Machine Intelligence – Complete guide to using TensorFlow, from the basics of graph computing to deep learning models and production deployment – Bleeding Edge Press
  • Getting Started with TensorFlow – Get up and running with the latest numerical computing library by Google and dive deeper into your data, by Giancarlo Zaccone
  • Hands-On Machine Learning with Scikit-Learn and TensorFlow – by Aurélien Géron, former lead of the YouTube video classification team. Covers ML fundamentals, training and deploying deep nets across multiple servers and GPUs using TensorFlow, the latest CNN, RNN and Autoencoder architectures, and Reinforcement Learning (Deep Q).
  • Building Machine Learning Projects with TensorFlow – by Rodolfo Bonnin. This book covers various projects that show what can be done with TensorFlow in different scenarios: training models, machine learning, deep learning, and working with various neural networks. Each project is an engaging and insightful exercise that teaches you how to use TensorFlow and shows how layers of data can be explored by working with tensors.

Contributions

Your contributions are always welcome!

If you want to contribute to this list (please do), send me a pull request or contact me @jtoy. Also, let me know if you notice that any of the above listed repositories should be deprecated, due to any of the following reasons:

More info on the guidelines

Credits

Softmax and Cross-Entropy

Suppose we have two numbers, a and b, with a > b. If we take the max, we always pick a; there is no second possibility.

But sometimes we don't want that, because the item with the smaller score would be starved. We would like the item with the larger score to be picked most of the time, while the item with the smaller score still gets picked occasionally, and softmax gives us exactly that. With the same a and b (a > b), if we pick a or b according to the probabilities that softmax assigns, then a's softmax value is larger than b's, so a is picked often while b is still picked occasionally, with probabilities that reflect their original magnitudes. So it is not a hard max, but a soft max. What exactly are those probabilities? Let's look at the definition.

Definition

Suppose we have an array V, with V_i denoting its i-th element. The softmax value of that element is

$$S_i = \frac{e^{V_i}}{\sum_j e^{V_j}}$$

That is, it is the exponential of the element divided by the sum of the exponentials of all the elements.
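To make the definition concrete, here is a minimal NumPy sketch (subtracting the max before exponentiating is a standard numerical-stability trick, not part of the definition itself):

import numpy as np

def softmax(v):
    """Softmax of a 1-D array: each element's exp over the sum of all exps."""
    e = np.exp(v - np.max(v))  # subtract max for numerical stability
    return e / e.sum()

print(softmax(np.array([3.0, 1.0])))  # ~[0.88 0.12]: a is picked often, b occasionally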

This definition is very intuitive, and beyond being simple and easy to understand, it has further advantages.

1. Measuring the gap from the labeled sample

In neural network computation, we often need to measure the gap between S1, the score computed by the network's forward pass, and S2, the score given by the correct label, and compute a loss from it before backpropagation can be applied. The loss is defined as the cross-entropy.

Cross-entropy describes how closely the two agree. For the correct class y, the loss is

$$L = -\log S_y$$

The softmax value inside the log lies in (0, 1]. Pulling the minus sign inside turns the argument into a reciprocal, $-\log S_y = \log(1/S_y)$, so L takes values in [0, +∞). The larger the softmax value, the smaller L: the numerator and denominator of the softmax fraction are closer together, meaning less of the score is lost to the other classes.

The value inside the log is the softmax value of the sample's correct class: the larger its share, the smaller that sample's loss. This definition is exactly what we want.
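A minimal NumPy sketch of this loss, assuming one-hot labels as in the MNIST code above:

import numpy as np

def cross_entropy(logits, one_hot_label):
    """L = -log(softmax value of the correct class)."""
    e = np.exp(logits - np.max(logits))
    probs = e / e.sum()
    return -np.log(probs[np.argmax(one_hot_label)])

logits = np.array([2.0, 0.5, 0.1])
label = np.array([1.0, 0.0, 0.0])    # correct class is the first one
print(cross_entropy(logits, label))  # ~0.32: small loss, the correct class dominates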


 


 

SVM picks only the heartthrob it fancies; softmax drags out all the backups, scores every one of them, and then normalizes on top of that.


 

On SVM and MLP

The essence of machine learning

1. In a CNN, the BP algorithm propagates errors backwards, but the definition of the error is fixed by a human beforehand.

2. In other words, the machine merely obtains a random probability through some mechanism (CNN, features, pooling, fully connected layers) and then uses some method (BP) to push that probability toward the one we specified.

3. Once it is close enough to our expectation, we declare training a success.

4. A successful training run means we obtained a good convolution kernel (filter). The filter is generated completely at random; as long as the probabilities come out right, it does not matter what the filter (a 2-D array) or the weights on the individual edges actually look like.

5. Using the same forward pass, we can make predictions on unseen inputs and obtain good outputs.

The above steps train the ConvNet – this essentially means that all the weights and parameters of the ConvNet have now been optimized to correctly classify images from the training set.

When a new (unseen) image is input into the ConvNet, the network would go through the forward propagation step and output a probability for each class (for a new image, the output probabilities are calculated using the weights which have been optimized to correctly classify all the previous training examples). If our training set is large enough, the network will (hopefully) generalize well to new images and classify them into correct categories.

Convolution and Pooling

Convolution

Convolution is a weighted superposition.

The output at a given moment is the superposition of many earlier inputs, each multiplied by its own decay coefficient. Collecting these outputs at different moments into a single function gives the convolution.

In the classic "slap" analogy, f(a) is the a-th slap and g(x − a) is how strongly the a-th slap is still felt at moment x; multiply them and sum over a.

In image processing, convolving a template (the convolution kernel) with an image is an integration (summation) operation: it measures the overlap area of two curves and can be viewed as a weighted sum, usable for noise removal and feature enhancement.

[References]
http://blog.csdn.net/tiandijun/article/details/40080823
https://www.zhihu.com/question/22298352
http://www.cnblogs.com/zf-blog/p/6075286.html
http://blog.csdn.net/silence1214/article/details/11809947


 


 

The Convolution Step

ConvNets derive their name from the “convolution” operator. The primary purpose of Convolution in case of a ConvNet is to extract features from the input image. Convolution preserves the spatial relationship between pixels by learning image features using small squares of input data. We will not go into the mathematical details of Convolution here, but will try to understand how it works over images.

As we discussed above, every image can be considered as a matrix of pixel values. Consider a 5 x 5 image whose pixel values are only 0 and 1 (note that for a grayscale image, pixel values range from 0 to 255; the green matrix below is a special case where pixel values are only 0 and 1):

Also, consider another 3 x 3 matrix as shown below:

Then, the Convolution of the 5 x 5 image and the 3 x 3 matrix can be computed as shown in the animation in Figure 5 below:

Figure 5: The Convolution operation. The output matrix is called Convolved Feature or Feature Map. Source [7]

Take a moment to understand how the computation above is being done. We slide the orange matrix over our original image (green) by 1 pixel (also called 'stride') and for every position, we compute an element-wise multiplication (between the two matrices) and add the multiplication outputs to get the final integer which forms a single element of the output matrix (pink). Note that the 3×3 matrix "sees" only a part of the input image in each stride.

In CNN terminology, the 3×3 (orange) matrix is called a 'filter' or 'kernel' (过滤器/卷积核) or 'feature detector', and the matrix formed by sliding the filter over the image and computing the dot product is called the 'Convolved Feature' or 'Activation Map' or the 'Feature Map'. It is important to note that filters act as feature detectors on the original input image.

It is evident from the animation above that different values of the filter matrix will produce different Feature Maps for the same input image. As an example, consider the following input image:

In the table below, we can see the effects of convolving the above image with different filters. As shown, we can perform operations such as Edge Detection, Sharpen and Blur just by changing the numeric values of our filter matrix before the convolution operation [8] – this means that different filters can detect different features from an image, for example edges, curves, etc. More such examples are available in Section 8.2.4 here.
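The sliding multiply-and-sum described above is easy to reproduce. Below is a minimal NumPy sketch with stride 1 and no padding; the pixel and filter values are assumed from the standard version of the 5×5/3×3 animation referenced above:

import numpy as np

def conv2d(image, kernel):
    """Slide kernel over image (stride 1, no padding); each output element is
    the element-wise product of the overlapping region, summed."""
    ih, iw = image.shape
    kh, kw = kernel.shape
    out = np.zeros((ih - kh + 1, iw - kw + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(image[i:i+kh, j:j+kw] * kernel)
    return out

image = np.array([[1, 1, 1, 0, 0],
                  [0, 1, 1, 1, 0],
                  [0, 0, 1, 1, 1],
                  [0, 0, 1, 1, 0],
                  [0, 1, 1, 0, 0]], dtype=float)
kernel = np.array([[1, 0, 1],
                   [0, 1, 0],
                   [1, 0, 1]], dtype=float)
print(conv2d(image, kernel))  # 3x3 feature map; top-left element is 4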

Another good way to understand the Convolution operation is by looking at the animation in Figure 6 below:


 


 

The size of the feature map is determined by three factors: depth, stride, and zero-padding.

Depth is the number of filters (convolution kernels) used to produce feature maps. Each kernel produces its own 2-D feature map, and stacking these maps in layers gives the depth.

The size of the Feature Map (Convolved Feature) is controlled by these three parameters [4], which we need to decide before the convolution step is performed.
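The resulting spatial size follows the standard formula (W − F + 2P)/S + 1, where W is the input width, F the filter size, P the zero-padding, and S the stride. A quick sketch:

def feature_map_size(w, f, p, s):
    """Output width for input width w, filter size f, padding p, stride s."""
    return (w - f + 2 * p) // s + 1

print(feature_map_size(5, 3, 0, 1))   # 3: the 5x5 image with the 3x3 filter above
print(feature_map_size(96, 8, 0, 1))  # 89: the 96x96 image / 8x8 patch example below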


 


 

Overview

The earlier exercises dealt with problems involving low-resolution images, e.g. small image patches and small handwritten-digit images. In this part we extend the methods we already know to the larger images that are much more common in real applications.

Fully Connected Networks

In the sparse autoencoder section, we introduced the design of "fully connecting" the input layer to the hidden layer. From a computational standpoint, for the relatively small images used in other sections (the 8×8 patches in the sparse autoencoder exercise, the 28×28 images of the MNIST dataset), computing features over the whole image is feasible. But for larger images (say 96×96), learning features that span the entire image with such a fully connected network becomes computationally very expensive: you would need on the order of 10^4 (= 10,000) input units, and learning 100 features would then require on the order of 10^6 parameters. The feedforward and backpropagation computations would also be about 10^2 (= 100) times slower than with 28×28 images.
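A quick back-of-the-envelope check of these counts (orders of magnitude only, ignoring biases):

inputs_large = 96 * 96                 # 9216 ~ 10^4 input units for a 96x96 image
hidden_features = 100
print(inputs_large * hidden_features)  # 921,600 ~ 10^6 weights to learn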


 

Locally Connected Networks

A simple way around this is to restrict the connections between the hidden units and the input units, so that each hidden unit connects to only a subset of the input units. For example, each hidden unit may connect only to a small contiguous region of the input image. (For input modalities other than images, there are often natural ways to choose the "contiguous group" of input units that a hidden unit connects to; for audio, a hidden unit might connect only to the input units spanning a particular time window of the signal.)

The idea of locally connected networks is also inspired by the structure of the biological visual system: neurons in the visual cortex receive information locally, i.e. each responds only to stimuli in a specific region.


 

Convolution

Natural images have a stationary property: the statistics of one part of the image are the same as those of any other part. This means that the features we learn on one part of the image can also be applied to other parts, so the same learned features can be used at every position of the image.

More precisely: take a small patch, say 8×8, sampled at random from a large image, and learn some features from it. We can then use the features learned from this 8×8 patch as detectors and apply them anywhere in the image. In particular, we can convolve the features learned from the 8×8 patch with the original large image, obtaining an activation value for each feature at every position of the large image.

A concrete example: suppose you have learned features on 8×8 patches of a 96×96 image, say with a sparse autoencoder that has 100 hidden units. To obtain the convolved features, you convolve over every 8×8 region of the 96×96 image: extract the 8×8 region starting at each coordinate (1,1), (1,2), …, up to (89,89), and run the trained sparse autoencoder on each extracted region to get its feature activations. This yields 100 sets of convolved features, each containing 89×89 values.


 

Formally: given a large r×c image, call it x_large. First train a sparse autoencoder on small a×b patches x_small sampled from the large image, computing f = σ(W⁽¹⁾ x_small + b⁽¹⁾) (σ is a sigmoid-type function) to obtain k features, where W⁽¹⁾ and b⁽¹⁾ are the weights and biases between the visible units and the hidden units. Then, for every a×b patch x_s of the large image, compute f_s = σ(W⁽¹⁾ x_s + b⁽¹⁾); convolving these f_convolved values yields an (r − a + 1) × (c − b + 1) × k array of convolved features.
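A minimal sketch of this computation; the random W1, b1 and x_large here are hypothetical stand-ins for a trained autoencoder's parameters and a real image, used only to make the shapes concrete:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

r = c = 96; a = b = 8; k = 100         # image size, patch size, feature count
W1 = np.random.randn(k, a * b) * 0.01  # stand-in for trained weights W(1)
b1 = np.zeros(k)                       # stand-in for trained biases b(1)
x_large = np.random.rand(r, c)         # stand-in for a real large image

conv_features = np.zeros((r - a + 1, c - b + 1, k))
for i in range(r - a + 1):
    for j in range(c - b + 1):
        patch = x_large[i:i+a, j:j+b].reshape(-1)
        conv_features[i, j, :] = sigmoid(W1 @ patch + b1)

print(conv_features.shape)  # (89, 89, 100) = (r-a+1, c-b+1, k)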


 

In the next section we describe how to pool these features together to obtain features that are better suited for classification.


 

Chinese–English Glossary

全联通网络 Fully Connected Networks
稀疏编码 Sparse Autoencoder
前向输送 Feedforward
反向传播 Backpropagation
部分联通网络 Locally Connected Networks
连接区域 Contiguous Groups
视觉皮层 Visual Cortex
卷积 Convolution
固有特征 Stationary
池化 Pool


 

Pooling: Overview

After obtaining features through convolution, the next step is to use them for classification. In theory, one could train a classifier such as a softmax classifier on all of the extracted features, but this is computationally challenging. Consider a 96×96-pixel image for which we have learned 400 features, each defined over an 8×8 input. Each feature convolved with the image yields a (96 − 8 + 1) × (96 − 8 + 1) = 7921-dimensional convolved feature, so with 400 features each example yields a convolved feature vector of 89² × 400 = 3,168,400 dimensions. Training a classifier with more than 3 million input features is inconvenient and prone to over-fitting.

To solve this, first recall why we decided to use convolved features: images have a "stationary" property, meaning that a feature that is useful in one image region is very likely to be useful in other regions as well. Therefore, to describe a large image, a natural idea is to aggregate statistics of the features at different locations; for example, one can compute the mean (or max) of a particular feature over a region of the image. These summary statistics not only have much lower dimension (compared to using all of the extracted features) but also improve the results (less over-fitting). This aggregation operation is called pooling, sometimes mean pooling or max pooling depending on how it is computed.
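A minimal NumPy sketch of pooling over disjoint regions (max pooling here; swap np.max for np.mean to get mean pooling):

import numpy as np

def max_pool(feature_map, m, n):
    """Partition feature_map into disjoint m x n regions; keep each region's max."""
    h, w = feature_map.shape
    out = np.zeros((h // m, w // n))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.max(feature_map[i*m:(i+1)*m, j*n:(j+1)*n])
    return out

fm = np.arange(16, dtype=float).reshape(4, 4)
print(max_pool(fm, 2, 2))  # [[ 5.  7.] [13. 15.]]: one max per 2x2 region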


 

Max pooling:

layer 1: edges
layer 2: shapes
layer 3: faces

Max is better: max pooling tends to work better in practice.

The figure below shows how pooling is applied to four non-overlapping regions of an image.


 

Invariance of Pooling

If we choose contiguous regions of the image as pooling regions, and pool only features produced by the same (replicated) hidden units, then these pooled units are translation invariant: even after the image undergoes a small translation, the same (pooled) features are produced. In many tasks (e.g. object detection, audio recognition), we prefer translation-invariant features, because the label of an example (image) stays the same even when the image is translated. For example, if you take an MNIST digit and shift it to the left or right, you would want your classifier to still classify it accurately as the same digit regardless of its final position.

(*MNIST is a handwritten digit recognition dataset: http://yann.lecun.com/exdb/mnist/)


 

Formal Description

Formally, after obtaining the convolved features discussed above, we decide on the size of the pooling region, say m×n, for pooling our convolved features. We partition the convolved features into disjoint m×n regions and take the mean (or maximum) feature over each region to obtain the pooled convolved features. These pooled features can then be used for classification.


 

Chinese–English Glossary

特征 features
样例 example
过拟合 over-fitting
平移不变性 translation invariant
池化 pooling
提取 extract
物体检测 object detection

What is the point of the sigmoid function?

What is the significance of the sigmoid function in neural networks?

The activation function in a neural network exists to introduce nonlinearity; the specific nonlinear form is a matter of choice.

One advantage of the sigmoid is that its output range is bounded, so data is unlikely to diverge as it is passed along. The corresponding drawback is that the gradient becomes very small when the unit saturates.

Another advantage is that its output range is (0, 1), so it can be used in an output layer, with the output representing a probability.
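A minimal sketch of the function and of the convenient derivative form σ'(z) = σ(z)(1 − σ(z)) that several answers below allude to:

import numpy as np

def sigmoid(z):
    """Squashes any real input into (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_grad(z):
    """Derivative takes the convenient form s * (1 - s)."""
    s = sigmoid(z)
    return s * (1.0 - s)

print(sigmoid(0.0), sigmoid_grad(0.0))  # 0.5 0.25: steepest at the origin
print(sigmoid_grad(10.0))               # ~4.5e-05: saturation, vanishing gradient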


 


 

One possible explanation: if you take the marginal (or conditional) of the probability distribution defined by the energy function of a binary RBM, the marginal (or conditional) of the hidden and visible variables has the sigmoid form…

Within the Exponential Family, the sigmoid is the standard Bernoulli marginal, and the exponential family is the family of maximum-entropy distributions given the data, which makes it "optimal" among linear generative models. The intuition is that a higher-entropy model is more robust than a lower-entropy one, being less affected by noise in the data.

That said, invoking maximum entropy for neural networks is stretching the family ties a bit.


 

Edited 2016-01-21


 

张金超

Natural language processing, machine translation


 

I think it can be understood from the viewpoint of infinite series: the series expansion of the exponential function combines components of all orders. The sigmoid contains an exponential term, its derivative has a nice form, and its range is convenient too.


 

Posted 2016-07-24


 

Anonymous user


 

Because the sigmoid's behavior matches that of neural synapses in neuroscience, and it is easy to differentiate.


 

Posted 2016-02-09


 

王彩玲

They are related, and on many levels. The sigmoid function is also a sum of Gaussians, and a Gaussian is a sum of random variables; the world is random at its root, and large amounts of randomness sometimes display regularities. That is the central limit theorem, which is also the origin of maximum information entropy, and the foundation of all this is Fermat's principle. To prove Fermat's principle, one needs to settle questions of differentiability; philosophically, that requires explaining the linear character of human cognition, which in turn involves how neurons respond to the world… and so the argument comes full circle, showing that it is self-consistent.


 

Posted 2016-10-18


 

高志华

A cook who can't write jokes isn't a good programmer


 

It's easy to differentiate, which makes backpropagation such a breeze.

Edited 2017-04-29


 

Homunculus

Showing off, and calling out show-offs, is why I'm on Zhihu


 

There's nothing mysterious about it. It's just a function that is flat at both ends and steep in the middle, with range 0 to 1, symmetric about the point (0, 1/2). It's an empirical choice; arctan x would work just as well.


 

Posted 2017-04-04


 

侍世腾

Live and learn


 

Monotonically increasing, with output between 0 and 1.


 

Posted 2016-05-04