智能计算系统 lab3.1 深度学习

实验目的

使用Python语言numpy模块基于VGG19网络模型实现ImageNet数据集分类
优化：使用 im2col 改善计算性能

这篇文章重点记录了原始前向传播和优化的过程与理解

实验内容

ConvolutionalLayer

简单前向传播：

def forward_slow(self, _input):
    """Reference convolution forward pass, computed one output cell at a time.

    `_input` is laid out as [batch, channel, height, width]. The result has
    shape [batch, channel_out, height_out, width_out] and is also stored on
    `self.output`; the input and its zero-padded copy are kept on
    `self.input` and `self.input_pad`.
    """
    self.input = _input
    in_shape = self.input.shape
    batch, in_h, in_w = in_shape[0], in_shape[2], in_shape[3]
    pad, ksize, step = self.padding, self.kernel_size, self.stride

    # Zero-padded copy of the input: `pad` extra rows/columns on every side.
    padded_h = in_h + pad * 2
    padded_w = in_w + pad * 2
    self.input_pad = np.zeros([batch, in_shape[1], padded_h, padded_w])
    self.input_pad[:, :, pad:pad + in_h, pad:pad + in_w] = self.input

    # Output spatial size, then a zero buffer to fill in.
    height_out = int((padded_h - ksize) / step + 1)
    width_out = int((padded_w - ksize) / step + 1)
    self.output = np.zeros([batch, self.channel_out, height_out, width_out])

    # Every output cell is an independent multiply-accumulate over one
    # window; this Python-level loop is the part the fast path replaces.
    for idxn, idxc, idxh, idxw in np.ndindex(batch, self.channel_out, height_out, width_out):
        top = idxh * step
        left = idxw * step
        window = self.input_pad[idxn, :, top:top + ksize, left:left + ksize]
        self.output[idxn, idxc, idxh, idxw] = np.sum(self.weight[:, :, :, idxc] * window) + self.bias[idxc]
    return self.output

优化思路：

im2col + GEMM
- 将输入特征图中每个卷积窗口展开（扁平化）为矩阵的一行，将卷积核展开为权重矩阵，再通过一次矩阵乘法计算出整层的卷积结果
- 缺点是内存（显存）开销较大：相邻窗口重叠的元素会在展开后的矩阵中被重复存储
- 图片来自*High Performance Convolutional Neural Networks for Document Processing*

Winograd，TODO

优化结果（大部分来自参考文章）：

def im2col(image, ksize, stride):
    """Unroll every ksize x ksize window of a 4-D image into one row.

    `image` is indexed as [batch, channel, height, width]. Rows are ordered
    by batch, then window top edge, then window left edge; each row holds
    the window flattened across all channels.
    """
    patches = [
        image[b, :, top:top + ksize, left:left + ksize].reshape([-1])
        for b in range(image.shape[0])
        for top in range(0, image.shape[2] - ksize + 1, stride)
        for left in range(0, image.shape[3] - ksize + 1, stride)
    ]
    return np.array(patches)

def forward_fast(self, _input):
    """Convolution forward pass implemented as im2col + one matrix product.

    `_input` is laid out as [batch, channel, height, width]. Returns (and
    stores on `self.output`) an array of shape
    [batch, channel_out, height_out, width_out]. The input, its zero-padded
    copy and the unrolled patch matrix are kept on `self.input`,
    `self.input_pad` and `self.col_image`.
    """
    self.input = _input
    # Axis 0 is the batch dimension (not the input-channel count).
    batch = self.input.shape[0]
    in_h, in_w = self.input.shape[2], self.input.shape[3]
    pad = self.padding

    # Zero-pad the two spatial axes.
    height = in_h + pad * 2
    width = in_w + pad * 2
    self.input_pad = np.zeros([batch, self.input.shape[1], height, width])
    self.input_pad[:, :, pad:pad + in_h, pad:pad + in_w] = self.input
    height_out = int((height - self.kernel_size) / self.stride + 1)
    width_out = int((width - self.kernel_size) / self.stride + 1)

    # Flatten the kernels to [channel_in * k * k, channel_out] so each column
    # lines up with the per-window rows produced by im2col.
    cout = self.weight.shape[3]
    col_weight = np.reshape(self.weight, [-1, cout])
    self.col_image = im2col(self.input_pad, self.kernel_size, self.stride)

    # One matrix product yields every output cell; rows are ordered
    # (batch, out_row, out_col), so reshape first and then move channels
    # to axis 1. No zero buffer is pre-allocated: the product creates it.
    flat_output = np.dot(self.col_image, col_weight) + self.bias
    self.output = np.transpose(
        np.reshape(flat_output, [batch, height_out, width_out, cout]),
        [0, 3, 1, 2])
    return self.output

MaxPoolingLayer

简单前向传播：

def forward_slow(self, _input):
    """Reference max-pooling forward pass, computed one output cell at a time.

    `_input` is laid out as [batch, channel, height, width]. Returns (and
    stores on `self.output`) the per-window maxima with shape
    [batch, channel, height_out, width_out].
    """
    self.input = _input
    # Zero buffer shaped like the input; this method only allocates it.
    self.max_index = np.zeros(self.input.shape)
    height_out = int((self.input.shape[2] - self.kernel_size) / self.stride + 1)
    width_out = int((self.input.shape[3] - self.kernel_size) / self.stride + 1)
    self.output = np.zeros([self.input.shape[0], self.input.shape[1], height_out, width_out])
    for idxn in range(self.input.shape[0]):
        for idxc in range(self.input.shape[1]):
            for idxh in range(height_out):
                for idxw in range(width_out):
                    h = idxh * self.stride
                    w = idxw * self.stride
                    self.output[idxn, idxc, idxh, idxw] = np.max(self.input[idxn, idxc, h:h + self.kernel_size, w:w + self.kernel_size])
    # Return only after every cell is filled; returning from inside the loop
    # would leave all but the first cell at zero.
    return self.output

这部分的优化较为简单：先把求 MaxPool 所需的所有 input 子矩阵提取出来，再利用 numpy 的 max 函数的向量化实现一次性求出最大值。缺点同样是内存开销较大。

快速前向传播：

def forward_fast(self, _input):
    """Max-pooling forward pass that gathers all windows before reducing.

    `_input` is laid out as [batch, channel, height, width]. Each pooling
    window is flattened into `self.input_vectorized`
    ([batch, channel, n_windows, k * k]); a single `np.max` over the last
    axis then yields the output [batch, channel, height_out, width_out].
    """
    self.input = _input
    self.max_index = np.zeros(self.input.shape)
    batch, channels = self.input.shape[0], self.input.shape[1]
    ksize, step = self.kernel_size, self.stride
    height_out = int((self.input.shape[2] - ksize) / step + 1)
    width_out = int((self.input.shape[3] - ksize) / step + 1)

    self.input_vectorized = np.zeros([batch, channels, height_out * width_out, ksize * ksize])
    # Row-major enumeration, so slot == row * width_out + col.
    for slot, (row, col) in enumerate(np.ndindex(height_out, width_out)):
        top, left = row * step, col * step
        window = self.input[:, :, top:top + ksize, left:left + ksize]
        self.input_vectorized[:, :, slot] = window.reshape([batch, channels, -1])

    pooled = np.max(self.input_vectorized, axis=-1)
    self.output = pooled.reshape([batch, channels, height_out, width_out])
    return self.output

整体代码

# coding=utf-8
# wyl 2022.4.15
import time
import numpy as np
import scipy.io as sio
from PIL import Image


def im2col(image, ksize, stride):
    """Turn a 4-D image into a 2-D matrix with one flattened window per row.

    `image` is indexed as [batch, channel, height, width]. Windows are
    visited batch by batch, top to bottom, left to right; each one is
    flattened across all channels.
    """
    def _windows():
        for sample in range(image.shape[0]):
            for top in range(0, image.shape[2] - ksize + 1, stride):
                bottom = top + ksize
                for left in range(0, image.shape[3] - ksize + 1, stride):
                    yield image[sample, :, top:bottom, left:left + ksize].reshape([-1])

    return np.array(list(_windows()))


def im2col_pool(image, ksize, stride):
    """Collect every ksize x ksize window, keeping channels separate.

    `image` is indexed as [batch, channel, height, width]. Each window
    becomes a [channel, ksize * ksize] matrix, and the windows are stacked
    along a new leading axis in batch, row, column order.
    """
    channels = image.shape[1]
    windows = [
        image[b, :, top:top + ksize, left:left + ksize].reshape([channels, -1])
        for b in range(image.shape[0])
        for top in range(0, image.shape[2] - ksize + 1, stride)
        for left in range(0, image.shape[3] - ksize + 1, stride)
    ]
    return np.array(windows)


class ConvolutionalLayer(object):
    """2-D convolution over [batch, channel, height, width] arrays.

    Weights are stored as [channel_in, kernel_size, kernel_size, channel_out]
    and the bias as [channel_out].
    """

    def __init__(self, kernel_size, channel_in, channel_out, padding, stride):
        """Record the hyper-parameters; weights are created by `init_param`
        or supplied through `load_param`."""
        self.kernel_size = kernel_size
        self.channel_in = channel_in
        self.channel_out = channel_out
        self.padding = padding
        self.stride = stride

    def init_param(self, std=0.01):
        """Draw the weights from a zero-mean normal with scale `std`; zero the bias."""
        weight_shape = (self.channel_in, self.kernel_size, self.kernel_size, self.channel_out)
        self.weight = np.random.normal(loc=0.0, scale=std, size=weight_shape)
        self.bias = np.zeros([self.channel_out])

    def forward(self, _input):
        """Run the forward pass using the im2col implementation."""
        return self.forward_fast(_input)

    def _pad_input(self, _input):
        """Store `_input`, build its zero-padded copy and return (height_out, width_out)."""
        self.input = _input
        pad = self.padding
        in_h, in_w = self.input.shape[2], self.input.shape[3]
        height = in_h + pad * 2
        width = in_w + pad * 2
        self.input_pad = np.zeros([self.input.shape[0], self.input.shape[1], height, width])
        self.input_pad[:, :, pad:pad + in_h, pad:pad + in_w] = self.input
        height_out = int((height - self.kernel_size) / self.stride + 1)
        width_out = int((width - self.kernel_size) / self.stride + 1)
        return height_out, width_out

    def forward_fast(self, _input):
        """Convolve via im2col followed by a single matrix product.

        Returns (and stores on `self.output`) an array of shape
        [batch, channel_out, height_out, width_out]; the unrolled patch
        matrix is kept on `self.col_image`.
        """
        height_out, width_out = self._pad_input(_input)
        # Axis 0 is the batch dimension (not the input-channel count).
        batch = self.input.shape[0]
        cout = self.weight.shape[3]
        # Flatten the kernels to [channel_in * k * k, channel_out] so each
        # column lines up with the per-window rows produced by im2col.
        col_weight = np.reshape(self.weight, [-1, cout])
        self.col_image = im2col(self.input_pad, self.kernel_size, self.stride)

        # Rows of the product are ordered (batch, out_row, out_col); reshape
        # first, then move channels to axis 1. No zero buffer is allocated
        # up front because the product creates the output itself.
        flat_output = np.dot(self.col_image, col_weight) + self.bias
        self.output = np.transpose(
            np.reshape(flat_output, [batch, height_out, width_out, cout]),
            [0, 3, 1, 2])
        return self.output

    def forward_slow(self, _input):
        """Reference convolution computed one output cell at a time.

        Produces the same output layout as `forward_fast` using explicit loops.
        """
        height_out, width_out = self._pad_input(_input)
        batch = self.input.shape[0]
        ksize = self.kernel_size
        self.output = np.zeros([batch, self.channel_out, height_out, width_out])
        for idxn in range(batch):
            for idxc in range(self.channel_out):
                kernel = self.weight[:, :, :, idxc]
                for idxh in range(height_out):
                    h = idxh * self.stride
                    for idxw in range(width_out):
                        w = idxw * self.stride
                        window = self.input_pad[idxn, :, h:h + ksize, w:w + ksize]
                        self.output[idxn, idxc, idxh, idxw] = np.sum(kernel * window) + self.bias[idxc]
        return self.output

    def load_param(self, weight, bias):
        """Replace the weights and bias with the given arrays (no shape check)."""
        self.weight = weight
        self.bias = bias


class MaxPoolingLayer(object):
    """Max pooling over square windows of [batch, channel, height, width] input."""

    def __init__(self, kernel_size, stride):
        """Record the window size and the step between windows."""
        self.kernel_size, self.stride = kernel_size, stride

    def forward(self, _input):
        """Run the forward pass using the vectorized implementation."""
        return self.forward_fast(_input)

    def forward_fast(self, _input):
        """Gather every window into `self.input_vectorized`, then reduce once.

        Returns (and stores on `self.output`) the maxima with shape
        [batch, channel, height_out, width_out].
        """
        self.input = _input
        self.max_index = np.zeros(self.input.shape)
        batch, channels = self.input.shape[0], self.input.shape[1]
        ksize, step = self.kernel_size, self.stride
        height_out = int((self.input.shape[2] - ksize) / step + 1)
        width_out = int((self.input.shape[3] - ksize) / step + 1)

        self.input_vectorized = np.zeros([batch, channels, height_out * width_out, ksize * ksize])
        # Row-major enumeration, so slot == row * width_out + col.
        for slot, (row, col) in enumerate(np.ndindex(height_out, width_out)):
            top, left = row * step, col * step
            window = self.input[:, :, top:top + ksize, left:left + ksize]
            self.input_vectorized[:, :, slot] = window.reshape([batch, channels, -1])

        pooled = np.max(self.input_vectorized, axis=-1)
        self.output = pooled.reshape([batch, channels, height_out, width_out])
        return self.output

    def forward_slow(self, _input):
        """Reference implementation: take the max of each window in a Python loop."""
        self.input = _input
        self.max_index = np.zeros(self.input.shape)
        batch, channels = self.input.shape[0], self.input.shape[1]
        ksize, step = self.kernel_size, self.stride
        height_out = int((self.input.shape[2] - ksize) / step + 1)
        width_out = int((self.input.shape[3] - ksize) / step + 1)

        self.output = np.zeros([batch, channels, height_out, width_out])
        for idxn, idxc, idxh, idxw in np.ndindex(batch, channels, height_out, width_out):
            top, left = idxh * step, idxw * step
            self.output[idxn, idxc, idxh, idxw] = np.max(
                self.input[idxn, idxc, top:top + ksize, left:left + ksize])
        return self.output


class FlattenLayer(object):
    """Reorder [batch, channel, height, width] input and flatten each sample."""

    def __init__(self, input_shape, output_shape):
        """Store the per-sample input and output shapes."""
        self.input_shape, self.output_shape = input_shape, output_shape

    def forward(self, _input):
        """Move channels to the last axis, then reshape to [batch, *output_shape]."""
        channels_last = np.transpose(_input, (0, 2, 3, 1))
        self.input = channels_last
        target_shape = [channels_last.shape[0]]
        target_shape.extend(self.output_shape)
        self.output = channels_last.reshape(target_shape)
        return self.output


class FullyConnectedLayer(object):
    """Dense layer computing `input . weight + bias`."""

    def __init__(self, num_input, num_output):
        """Store the input and output feature counts."""
        self.num_input, self.num_output = num_input, num_output

    def init_param(self, std=0.01):
        """Draw the weights from a zero-mean normal with scale `std`; zero the bias."""
        weight_shape = (self.num_input, self.num_output)
        self.weight = np.random.normal(loc=0.0, scale=std, size=weight_shape)
        self.bias = np.zeros([1, self.num_output])

    def forward(self, _input):
        """Remember the input for `backward` and return the affine output."""
        self.input = _input
        return np.dot(_input, self.weight) + self.bias

    def backward(self, top_diff):
        """Store the weight and bias gradients; return the gradient w.r.t. the input."""
        self.d_weight = np.dot(self.input.T, top_diff)
        self.d_bias = np.sum(top_diff, axis=0)
        return np.dot(top_diff, self.weight.T)

    def update_param(self, lr):
        """Take one gradient-descent step of size `lr`, rebinding to new arrays."""
        weight_step = lr * self.d_weight
        bias_step = lr * self.d_bias
        self.weight = self.weight - weight_step
        self.bias = self.bias - bias_step

    def load_param(self, weight, bias):
        """Replace the parameters; the shapes must match the existing ones."""
        assert self.weight.shape == weight.shape
        assert self.bias.shape == bias.shape
        self.weight, self.bias = weight, bias

    def save_param(self):
        """Return the current (weight, bias) pair."""
        return self.weight, self.bias


class ReLULayer(object):
    """Element-wise rectified linear unit."""

    def forward(self, _input):
        """Remember the input for `backward` and return `max(input, 0)`."""
        self.input = _input
        output = np.maximum(self.input, 0)
        return output

    def backward(self, top_diff):
        """Return the upstream gradient with entries zeroed where the input was negative.

        The mask is applied to a copy, so the caller's `top_diff` array is
        left untouched.
        """
        bottom_diff = top_diff.copy()
        bottom_diff[self.input < 0] = 0
        return bottom_diff


class SoftmaxLossLayer(object):
    """Row-wise softmax with a cross-entropy loss and its gradient."""

    def forward(self, _input):
        """Return the softmax of each row of `_input` and keep it on `self.prob`."""
        # Subtracting the row maximum keeps exp() from overflowing and does
        # not change the normalized result.
        input_max = np.max(_input, axis=1, keepdims=True)
        input_exp = np.exp(_input - input_max)
        self.prob = input_exp / np.sum(input_exp, axis=1, keepdims=True)
        return self.prob

    def get_loss(self, label):
        """Return the mean cross-entropy of `self.prob` against integer labels.

        `label` indexes the true class of each row. The one-hot encoding is
        stored on `self.label_onehot` for `backward`.
        """
        self.batch_size = self.prob.shape[0]
        rows = np.arange(self.batch_size)
        self.label_onehot = np.zeros_like(self.prob)
        self.label_onehot[rows, label] = 1.0
        # Take the log of the label probabilities only. Multiplying the full
        # log matrix by the one-hot mask yields -inf * 0 = nan as soon as any
        # other class underflows to probability 0.
        loss = -np.sum(np.log(self.prob[rows, label])) / self.batch_size
        return loss

    def backward(self):
        """Return the loss gradient w.r.t. the softmax input (requires `get_loss` first)."""
        bottom_diff = (self.prob - self.label_onehot) / self.batch_size
        return bottom_diff


class VGG19(object):
    """VGG-19 classifier assembled from the numpy layers defined in this file."""

    def __init__(self, param_path='imagenet-vgg-verydeep-19.mat'):
        """Store the path of the pretrained .mat file and the layer execution order."""
        self.param_path = param_path
        self.param_layer_name = (
            'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1',
            'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
            'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3',
            'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4',
            'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5',
            'flatten', 'fc6', 'relu6', 'fc7', 'relu7', 'fc8', 'softmax')

    def build_model(self):
        """Create every layer object and list the layers that own parameters."""
        print('Building model')
        # (block number, conv layers in the block, channels in, channels out).
        # Every conv is 3x3 with padding 1 and stride 1; every block ends in
        # a 2x2, stride-2 max pool.
        conv_blocks = (
            (1, 2, 3, 64),
            (2, 2, 64, 128),
            (3, 4, 128, 256),
            (4, 4, 256, 512),
            (5, 4, 512, 512),
        )
        self.layers = {}
        for block, num_convs, channel_in, channel_out in conv_blocks:
            for conv_idx in range(1, num_convs + 1):
                suffix = '%d_%d' % (block, conv_idx)
                self.layers['conv' + suffix] = ConvolutionalLayer(3, channel_in, channel_out, 1, 1)
                self.layers['relu' + suffix] = ReLULayer()
                # Only the first conv of a block changes the channel count.
                channel_in = channel_out
            self.layers['pool%d' % block] = MaxPoolingLayer(2, 2)

        self.layers['flatten'] = FlattenLayer([512, 7, 7], [512*7*7])
        self.layers['fc6'] = FullyConnectedLayer(512*7*7, 4096)
        self.layers['relu6'] = ReLULayer()
        self.layers['fc7'] = FullyConnectedLayer(4096, 4096)
        self.layers['relu7'] = ReLULayer()
        self.layers['fc8'] = FullyConnectedLayer(4096, 1000)
        self.layers['softmax'] = SoftmaxLossLayer()

        self.update_layer_list = [
            layer_name for layer_name in self.layers
            if 'conv' in layer_name or 'fc' in layer_name
        ]

    def init_model(self):
        """Randomly initialize every conv and fully-connected layer."""
        print('Initing model')
        for layer_name in self.update_layer_list:
            self.layers[layer_name].init_param()

    def load_model(self):
        """Load the image mean and the conv/fc parameters from the .mat file."""
        print('Loading model')
        params = sio.loadmat(self.param_path)
        # Averaged over the first two axes; presumably a per-channel mean
        # from an [H, W, C] mean image -- TODO confirm against the .mat file.
        self.image_mean = params['normalization'][0][0][0]
        self.image_mean = np.mean(self.image_mean, axis=(0, 1))
        flatten_idx = self.param_layer_name.index('flatten')
        for idx, layer_name in enumerate(self.param_layer_name):
            if 'conv' in layer_name:
                weight, bias = params['layers'][0][idx][0][0][0][0]
                # Move axis 2 to the front to match the layer's
                # [channel_in, k, k, channel_out] weight layout.
                weight = np.transpose(weight, [2, 0, 1, 3])
                bias = bias.reshape(-1)
                self.layers[layer_name].load_param(weight, bias)
            if idx >= flatten_idx and 'fc' in layer_name:
                # Entries after 'flatten' sit one slot earlier in the file,
                # presumably because the .mat layer list has no 'flatten' entry.
                weight, bias = params['layers'][0][idx-1][0][0][0][0]
                weight = weight.reshape([weight.shape[0]*weight.shape[1]*weight.shape[2], weight.shape[3]])
                self.layers[layer_name].load_param(weight, bias)

    def forward(self):
        """Run `self.input_image` through every layer in order, printing per-layer timings."""
        current = self.input_image
        for layer_name in self.param_layer_name:
            print('Forwarding in layer: %s' % layer_name, end="")
            start_time = time.time()
            current = self.layers[layer_name].forward(current)
            end_time = time.time()
            print(' time:%.2fs' % (end_time-start_time))

        return current

    def evaluate(self):
        """Classify the loaded image and print the top-1 class id and probability."""
        prob = self.forward()
        top1 = np.argmax(prob[0])
        print('Classification result: id=%d, prob=%f' % (top1, prob[0, top1]))

    def load_image(self, image_dir):
        """Read an image file into a mean-subtracted [1, 3, 224, 224] float32 array.

        Requires `load_model` to have set `self.image_mean`.
        """
        print('Loading and preprocessing image from ' + image_dir)
        # Force three channels so grayscale, palette and RGBA files line up
        # with the 3-element mean; the context manager closes the file.
        with Image.open(image_dir) as raw_image:
            resized = raw_image.convert('RGB').resize((224, 224))
        self.input_image = np.array(resized).astype(np.float32)
        self.input_image -= self.image_mean
        # Add the batch axis, then go from [N, H, W, C] to [N, C, H, W].
        self.input_image = np.reshape(self.input_image, [1]+list(self.input_image.shape))
        self.input_image = np.transpose(self.input_image, [0, 3, 1, 2])


if __name__ == '__main__':
    # Script entry point: classify one image with the pretrained network.
    vgg = VGG19()
    vgg.build_model()
    # Random init must come before load_model(): FullyConnectedLayer.load_param
    # asserts the new arrays against the shapes of the existing weight and bias.
    vgg.init_model()
    # load_model() also sets image_mean, which load_image() subtracts.
    vgg.load_model()
    vgg.load_image('c67ab7b456b55a4cba9f6e2956b7c2ef.jpeg')
    vgg.evaluate()