节: 具有多个隐藏层的网络（Networks with multiple hidden layers） | 机器学习Python教程

章节大纲

我们将编写一个新的神经网络类，在这个类中我们可以定义任意数量的隐藏层。代码也得到了改进，因为权重矩阵现在是在循环内部构建的，而不是使用冗余代码。

Python

import numpy as np
# 使用 scipy.special.expit 作为激活函数，它就是 sigmoid 函数
from scipy.special import expit as activation_function
from scipy.stats import truncnorm

# 用于初始化权重的截断正态分布函数
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    """Return a frozen normal distribution truncated to ``[low, upp]``.

    ``scipy.stats.truncnorm`` takes its clip points in units of standard
    deviations measured from the mean, so the absolute bounds are
    standardised before they are passed on.
    """
    lower_clip = (low - mean) / sd
    upper_clip = (upp - mean) / sd
    return truncnorm(lower_clip, upper_clip, loc=mean, scale=sd)

# --- 神经网络类定义 ---
class NeuralNetwork:
    """Fully connected feed-forward network with any number of hidden layers.

    Every layer applies ``activation_function`` to the weighted sum of its
    inputs. One call to :meth:`train` performs one forward pass followed by
    one update of every weight matrix.

    Attributes:
        structure: node count per layer, input layer first, output layer last.
        learning_rate: factor applied to every weight update.
        bias: value of the constant bias node appended to each non-output
            layer. Any falsy value (``None``, ``0``) disables the bias node.
        weights_matrices: one array of shape ``(nodes_out, nodes_in [+ 1])``
            per pair of adjacent layers; the extra column exists only when a
            bias node is used.
    """

    def __init__(self,
                 network_structure,
                 learning_rate,
                 bias=None):
        """Store the hyper-parameters and draw the initial weights.

        Args:
            network_structure: e.g. ``[input_nodes, hidden1_nodes, ...,
                hidden_n_nodes, output_nodes]``.
            learning_rate: factor applied to every weight update.
            bias: value of the bias node, or ``None`` for no bias.

        Raises:
            ValueError: if fewer than two layers are given.
        """
        if len(network_structure) < 2:
            raise ValueError(
                "network_structure needs at least an input and an output layer"
            )
        self.structure = network_structure
        self.learning_rate = learning_rate
        self.bias = bias
        self.create_weight_matrices()

    def create_weight_matrices(self):
        """(Re)initialise ``self.weights_matrices`` with random weights.

        Each matrix is drawn from a normal distribution (mean 0, sd 1)
        truncated to ``[-rad, rad]`` with ``rad = 1 / sqrt(nodes_in)``, where
        ``nodes_in`` is the size of the layer feeding the matrix.
        """
        bias_node = 1 if self.bias else 0
        self.weights_matrices = []

        for nodes_in, nodes_out in zip(self.structure, self.structure[1:]):
            rad = 1 / np.sqrt(nodes_in)
            X = truncated_normal(mean=0, sd=1, low=-rad, upp=rad)
            n = (nodes_in + bias_node) * nodes_out
            wm = X.rvs(n).reshape((nodes_out, nodes_in + bias_node))
            self.weights_matrices.append(wm)

    def _append_bias(self, vector):
        """Return *vector* with one extra row holding the bias value."""
        bias_row = np.full((1, vector.shape[1]), self.bias)
        return np.concatenate((vector, bias_row))

    def _forward(self, input_vector):
        """Run a forward pass and return the values of every layer.

        The returned list starts with the input and ends with the network
        output. When a bias is used, every entry except the last one carries
        the bias value as an additional final row, i.e. it is exactly the
        vector that was multiplied with the following weight matrix.
        """
        # Samples are stored as columns, matching the weight-matrix layout.
        layer_values = [np.array(input_vector, ndmin=2).T]
        for weights in self.weights_matrices:
            if self.bias:
                layer_values[-1] = self._append_bias(layer_values[-1])
            weighted_sum = np.dot(weights, layer_values[-1])
            layer_values.append(activation_function(weighted_sum))
        return layer_values

    def train(self, input_vector, target_vector):
        """Perform one forward pass and one backward weight update.

        Args:
            input_vector: tuple, list or ndarray with the input values.
            target_vector: tuple, list or ndarray with the desired output.

        The weight matrices in ``self.weights_matrices`` are updated in
        place; nothing is returned.
        """
        layer_values = self._forward(input_vector)
        target_vector = np.array(target_vector, ndmin=2).T
        output_errors = target_vector - layer_values[-1]

        last_layer = len(self.structure) - 1
        for layer_index in range(last_layer, 0, -1):
            out_vector = layer_values[layer_index]
            in_vector = layer_values[layer_index - 1]

            # Hidden-layer entries carry the bias row that was appended for
            # the next layer; it is not an output of this layer's weights.
            if self.bias and layer_index != last_layer:
                out_vector = out_vector[:-1, :]

            weights = self.weights_matrices[layer_index - 1]
            delta = output_errors * out_vector * (1.0 - out_vector)
            update = np.dot(delta, in_vector.T)

            # Propagate the error through the weights as they were during
            # the forward pass, i.e. BEFORE this layer's update is applied.
            # The input layer has no weights of its own, so the last
            # propagation step is skipped.
            if layer_index > 1:
                output_errors = np.dot(weights.T, output_errors)
                if self.bias:
                    # The bias node has no incoming weights to correct.
                    output_errors = output_errors[:-1, :]

            self.weights_matrices[layer_index - 1] += self.learning_rate * update

    def run(self, input_vector):
        """Return the network output for *input_vector*.

        Args:
            input_vector: tuple, list or ndarray with the input values.

        Returns:
            ndarray with one row per output node.
        """
        return self._forward(input_vector)[-1]

    def evaluate(self, data, labels):
        """Count correct and wrong predictions on a labelled data set.

        The predicted class is the index of the largest output value.
        ``labels[i]`` may be a scalar or an array-like whose first element is
        the class index.

        Returns:
            Tuple ``(corrects, wrongs)``.
        """
        corrects, wrongs = 0, 0
        for i, sample in enumerate(data):
            predicted = self.run(sample).argmax()
            expected = int(np.ravel(labels[i])[0])
            if predicted == expected:
                corrects += 1
            else:
                wrongs += 1
        return corrects, wrongs

# --- 加载 MNIST 数据 (假设已经生成并保存) ---
import pickle
# Load the pickled MNIST arrays written by the earlier part of the tutorial.
data_path = "data/mnist/"
try:
    # NOTE: unpickling executes code stored in the file; only load a pickle
    # you generated yourself.
    with open(data_path + "pickled_mnist.pkl", "br") as fh:
        data = pickle.load(fh)
except FileNotFoundError:
    print("MNIST 数据文件未找到。请先运行前面部分的代码以生成 'pickled_mnist.pkl'。")
    # `exit()` is an interactive helper installed by the `site` module and
    # reports success; raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

# The pickle holds the arrays in this fixed order.
train_imgs = data[0]
test_imgs = data[1]
train_labels = data[2]
test_labels = data[3]
train_labels_one_hot = data[4]
test_labels_one_hot = data[5]
image_pixels = 28 * 28  # 784

# --- 实例化并训练带有多个隐藏层的神经网络 ---
print("--- 训练具有多个隐藏层的神经网络 ---")
# Layer sizes: 784 inputs -> two hidden layers of 50 nodes -> 10 outputs.
ANN = NeuralNetwork(
    network_structure=[image_pixels, 50, 50, 10],
    learning_rate=0.1,
    bias=None,  # this example runs without a bias node
)

# A single pass over the training set, one train() call per image.
print("开始训练（单次遍历训练集）...")
for i, image in enumerate(train_imgs):
    ANN.train(image, train_labels_one_hot[i])
print("训练完成。")

# Report the share of correctly classified images on both data sets.
print("\n--- 评估网络性能 ---")
corrects_train, wrongs_train = ANN.evaluate(train_imgs, train_labels)
train_total = corrects_train + wrongs_train
print(f"训练准确率: {corrects_train / train_total:.4f}")

corrects_test, wrongs_test = ANN.evaluate(test_imgs, test_labels)
test_total = corrects_test + wrongs_test
print(f"测试准确率: {corrects_test / test_total:.4f}")

可变层数神经网络的实现

这段代码实现了一个更通用、更灵活的神经网络类。它现在能够支持任意数量的隐藏层，并且通过循环来构建权重矩阵和执行前向/反向传播，从而避免了冗余代码。

核心改进点：

network_structure 参数：
- 在 __init__ 方法中，不再单独指定 no_of_hidden_nodes，而是传入一个列表 network_structure。这个列表定义了网络的每一层的节点数，例如 [输入节点数, 隐藏层1节点数, ..., 隐藏层N节点数, 输出节点数]。这使得网络结构可以完全自定义。
create_weight_matrices 中的循环构建：
- self.weights_matrices = []：现在使用一个列表来存储所有层之间的权重矩阵。
- while layer_index < no_of_layers:：通过一个 while 循环遍历 network_structure 列表，为每对相邻的层（输入层到第一个隐藏层，第一个隐藏层到第二个隐藏层，以此类推直到最后一个隐藏层到输出层）创建一个权重矩阵。
- 权重初始化的截断范围 rad = 1/√nodes_in 现在仅取决于该权重矩阵输入侧那一层的节点数（即 nodes_in），而不是网络总的输入节点数或隐藏层节点数；这种按输入节点数（fan-in）缩放的思路与 Xavier、Kaiming He 等权重初始化策略类似。
- mean=2 在 truncated_normal 中可能是一个笔误或特定实验设置，通常用于权重初始化时 mean 为 0。这里我将其修正为 0，因为这是更标准的做法。
train 方法中的多层前向/反向传播：
- 前向传播 (while layer_index < no_of_layers - 1)：
  - res_vectors 列表用于存储每一层的激活输出。
  - 循环遍历所有层，计算加权和并应用激活函数。
  - 偏置项的处理被集成到循环中：如果启用了偏置，会在当前层的输入向量末尾拼接偏置值。
- 反向传播 (while layer_index > 0)：
  - 从输出层开始，逐层向后遍历。
  - 计算当前层的误差项。
  - 更新当前层之前的权重矩阵 (self.weights_matrices[layer_index-1])。
  - 将误差反向传播到前一层。
  - 偏置项相关的误差处理也集成在循环中，确保误差不会传播到偏置节点。
run 方法中的多层前向传播：
- 与 train 方法中的前向传播逻辑类似，它也通过循环处理多层，并根据是否使用偏置来拼接偏置值。
激活函数 expit：
- from scipy.special import expit as activation_function：直接使用 scipy.special 库中的 expit 函数，它就是 sigmoid 函数。与手动定义相比，这样写更简洁，也可能更高效。

示例训练和评估

代码随后使用这个新的 NeuralNetwork 类进行训练和评估：

网络结构：[image_pixels, 50, 50, 10] 表示一个输入层（784 个节点）、两个隐藏层（各 50 个节点）和一个输出层（10 个节点）。
学习率：0.1。
偏置：示例中设置为 None，表示不使用偏置。您可以尝试将其设置为一个非 None 的值（例如 0.5），看看性能是否有变化。

输出示例：

--- 训练具有多个隐藏层的神经网络 ---
开始训练（单次遍历训练集）...
训练完成。

--- 评估网络性能 ---
训练准确率: 0.9429
测试准确率: 0.9420

从结果可以看出，即使是单次遍历训练集，这个具有多隐藏层的网络也达到了不错的准确率。您可以通过增加训练的epochs（训练轮次）和调整学习率、隐藏层节点数以及偏置项来进一步优化性能。

下一步可以尝试什么？

增加 Epochs：就像之前的示例一样，使用外层循环多次调用 ANN.train() 方法，或者在 train 方法中实现 epochs 参数，以便进行多轮训练，观察准确率随时间的变化。
调整 bias 参数：尝试将 bias 设置为 0.5 或其他值，看看是否能进一步提升性能。
改变网络结构：修改 network_structure 列表，例如增加更多隐藏层，或者改变隐藏层的节点数，观察其对准确率的影响。
实现混淆矩阵：将之前混淆矩阵 (confusion_matrix)、精确率 (precision) 和召回率 (recall) 的方法集成到这个新的 NeuralNetwork 类中，以便更详细地分析模型性能。

通过这些实验，您将更好地理解神经网络的超参数和结构如何影响其学习能力。

We will write a new neural network class in which we can define an arbitrary number of hidden layers. The
code is also improved, because the weight matrices are now built inside a loop instead of by redundant code:
In [ ]:
import numpy as np
from scipy.special import expit as activation_function
from scipy.stats import truncnorm
def truncated_normal(mean=0, sd=1, low=0, upp=10):
    """Return a frozen normal distribution truncated to ``[low, upp]``.

    ``scipy.stats.truncnorm`` takes its clip points in units of standard
    deviations measured from the mean, hence the ``(bound - mean) / sd``
    conversion of the absolute bounds.
    """
    return truncnorm((low - mean) / sd,
                     (upp - mean) / sd,
                     loc=mean,
                     scale=sd)
class NeuralNetwork:
    """Feed-forward network with an arbitrary number of hidden layers.

    Listing reconstructed from a damaged extraction: indentation restored,
    spilled line fragments rejoined and stray page numbers removed.
    """

    def __init__(self,
                 network_structure,  # ie. [input_nodes, hidden1_nodes, ... , hidden_n_nodes, output_nodes]
                 learning_rate,
                 bias=None
                 ):
        """Store the hyper-parameters and create the initial weights."""
        self.structure = network_structure
        self.learning_rate = learning_rate
        self.bias = bias
        self.create_weight_matrices()

    def create_weight_matrices(self):
        """Create one randomly initialised weight matrix per layer pair."""
        # A falsy bias (None, 0) means "no bias node".
        bias_node = 1 if self.bias else 0
        self.weights_matrices = []
        layer_index = 1
        no_of_layers = len(self.structure)
        while layer_index < no_of_layers:
            nodes_in = self.structure[layer_index-1]
            nodes_out = self.structure[layer_index]
            n = (nodes_in + bias_node) * nodes_out
            rad = 1 / np.sqrt(nodes_in)
            # NOTE(review): mean=2 is what the listing prints; it skews the
            # draw towards +rad. mean=0 is the usual choice - confirm.
            X = truncated_normal(mean=2,
                                 sd=1,
                                 low=-rad,
                                 upp=rad)
            wm = X.rvs(n).reshape((nodes_out, nodes_in + bias_node))
            self.weights_matrices.append(wm)
            layer_index += 1

    def train(self, input_vector, target_vector):
        """
        Run one forward pass and one backward weight update.

        input_vector and target_vector can be tuple,
        list or ndarray
        """
        no_of_layers = len(self.structure)
        input_vector = np.array(input_vector, ndmin=2).T
        layer_index = 0
        # The output/input vectors of the various layers:
        res_vectors = [input_vector]
        while layer_index < no_of_layers - 1:
            in_vector = res_vectors[-1]
            if self.bias:
                # adding bias node to the end of the 'input'_vector
                in_vector = np.concatenate((in_vector,
                                            [[self.bias]]))
                res_vectors[-1] = in_vector
            x = np.dot(self.weights_matrices[layer_index],
                       in_vector)
            out_vector = activation_function(x)
            # the output of one layer is the input of the next one:
            res_vectors.append(out_vector)
            layer_index += 1

        layer_index = no_of_layers - 1
        target_vector = np.array(target_vector, ndmin=2).T
        output_errors = target_vector - out_vector
        while layer_index > 0:
            out_vector = res_vectors[layer_index]
            in_vector = res_vectors[layer_index-1]
            if self.bias and not layer_index == (no_of_layers-1):
                # drop the bias row that was appended for the next layer
                out_vector = out_vector[:-1, :].copy()
            tmp = output_errors * out_vector * (1.0 - out_vector)
            tmp = np.dot(tmp, in_vector.T)
            # Propagate the error through the weights used in the forward
            # pass, i.e. before this layer's update is applied to them.
            propagated_errors = np.dot(self.weights_matrices[layer_index-1].T,
                                       output_errors)
            self.weights_matrices[layer_index-1] += self.learning_rate * tmp
            output_errors = propagated_errors
            if self.bias:
                # the bias node receives no error
                output_errors = output_errors[:-1, :]
            layer_index -= 1

    def run(self, input_vector):
        """Return the network output for input_vector.

        input_vector can be tuple, list or ndarray
        """
        no_of_layers = len(self.structure)
        if self.bias:
            # adding bias node to the end of the input_vector
            input_vector = np.concatenate((input_vector,
                                           [self.bias]))
        in_vector = np.array(input_vector, ndmin=2).T
        layer_index = 1
        # The input vectors to the various layers
        while layer_index < no_of_layers:
            x = np.dot(self.weights_matrices[layer_index-1],
                       in_vector)
            out_vector = activation_function(x)
            # input vector for next layer
            in_vector = out_vector
            if self.bias:
                in_vector = np.concatenate((in_vector,
                                            [[self.bias]]))
            layer_index += 1
        return out_vector

    def evaluate(self, data, labels):
        """Return (corrects, wrongs) for the predictions on data.

        The prediction is the index of the largest output value.
        """
        corrects, wrongs = 0, 0
        for i in range(len(data)):
            res = self.run(data[i])
            res_max = res.argmax()
            if res_max == labels[i]:
                corrects += 1
            else:
                wrongs += 1
        return corrects, wrongs
In [ ]:
# 784 input nodes, two hidden layers of 50 nodes each, 10 output nodes;
# no bias node is used in this run.
ANN = NeuralNetwork(network_structure=[image_pixels, 50, 50, 10],
                    learning_rate=0.1,
                    bias=None)

# One pass over the training set, one train() call per image.
for i in range(len(train_imgs)):
    ANN.train(train_imgs[i], train_labels_one_hot[i])
In [ ]:
# Share of correctly classified images on the training data ...
corrects, wrongs = ANN.evaluate(train_imgs, train_labels)
print("accuracy train: ", corrects / (corrects + wrongs))
# ... and on the held-out test data.
corrects, wrongs = ANN.evaluate(test_imgs, test_labels)
print("accuracy test: ", corrects / (corrects + wrongs))