Kaggle实验:房价预测(详细注释) + pytorch GPU测试

发布于 2022-07-16  1158 次阅读


学习过程记录一下
import torch
from torch.nn import MSELoss,Linear,init
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset,DataLoader
import time
from matplotlib import pyplot as plt
import matplotlib_inline.backend_inline

begin = time.time()  # wall-clock start of the whole script
# Make float32 the default tensor type.
# NOTE(review): torch.set_default_tensor_type is deprecated in recent
# PyTorch releases; torch.set_default_dtype(torch.float32) is the modern
# equivalent.
torch.set_default_tensor_type(torch.FloatTensor)
# Load the Kaggle "House Prices: Advanced Regression Techniques" data.
train_data = pd.read_csv("Datasets/experiment-house_prices_advanced_regression/train.csv")
test_data = pd.read_csv("Datasets/experiment-house_prices_advanced_regression/test.csv")
# Stack train + test features into one frame: column 0 (the Id) is dropped,
# and the last train column (SalePrice) is kept aside as the label.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# Column labels of every numeric (non-object-dtype) feature.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize each numeric column: x = (x - mean) / std.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization each numeric column has mean 0, so missing values
# can simply be replaced with 0.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode the categorical features (NaN becomes its own category).
all_features = pd.get_dummies(all_features, dummy_na=True)
# Number of training samples (rows of the original training CSV).
n_train = train_data.shape[0]
# Split back into train/test feature tensors plus the label column vector.
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)
# Mean-squared-error loss shared by the training loop below.
loss = MSELoss()
#初始化神经网络
def get_net(feature_num):
    """Build a one-layer linear-regression net for `feature_num` inputs.

    Both the weight matrix and the bias are initialized from N(0, 0.01^2).
    """
    linear = Linear(feature_num, 1)
    init.normal_(linear.weight, mean=0, std=0.01)
    init.normal_(linear.bias, mean=0, std=0.01)
    return linear
#对数均方根误差计算函数
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Args:
        net: callable mapping a feature tensor to a prediction tensor.
        features: input tensor fed to `net`.
        labels: ground-truth tensor, same shape as the predictions.
    """
    with torch.no_grad():
        # Clamp predictions below 1 up to 1 so the log stays finite/stable.
        # (torch.clamp replaces the original torch.max against a freshly
        # allocated scalar tensor — same result, clearer intent.)
        clipped_preds = torch.clamp(net(features), min=1.0)
        # Functional MSE (default 'mean' reduction, identical to MSELoss())
        # removes the hidden dependence on the module-level `loss` global.
        rmse = torch.sqrt(
            torch.nn.functional.mse_loss(clipped_preds.log(), labels.log()))
    return rmse.item()

#作图模块
def use_svg_display():
    """Render matplotlib figures as SVG in notebook output.

    Bug fix: the original call passed no format argument, so the inline
    backend was never actually switched to SVG; 'svg' must be passed.
    """
    matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set the default figure size."""
    use_svg_display()
    plt.rcParams.update({'figure.figsize': figsize})

def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    """Plot y vs. x with a log-scaled y axis; optionally add a second
    (dotted) curve and a legend."""
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    has_second_curve = x2_vals and y2_vals
    if has_second_curve:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
#作图模块结束

#训练函数
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam on MSE loss for `num_epochs` epochs.

    Returns two lists: the log-RMSE on the training set after each epoch,
    and (when `test_labels` is given) the log-RMSE on the test set.
    """
    train_ls, test_ls = [], []
    # Mini-batch iterator over the training set, reshuffled each epoch.
    loader = DataLoader(TensorDataset(train_features, train_labels),
                        batch_size, shuffle=True)
    # Adam with L2 regularization via weight_decay.
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)

    net = net.float()

    for _ in range(num_epochs):
        for batch_x, batch_y in loader:
            # Reset accumulated gradients, then one forward/backward/step.
            optimizer.zero_grad()
            batch_loss = loss(net(batch_x.float()), batch_y.float())
            batch_loss.backward()
            optimizer.step()
        # Record log-RMSE on the full training set after this epoch.
        train_ls.append(log_rmse(net, train_features, train_labels))
        # And on the test set, when labels were provided.
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
    
#K折交叉验证函数(可选可注释):
#返回第i折交叉验证时所需要的训练和验证数据
def get_k_fold_data(k, i, X, y):
    """Return (X_train, y_train, X_valid, y_valid) for fold `i` of `k`.

    The data is cut into `k` equal folds (any remainder rows beyond
    k * fold_size are dropped); fold `i` becomes the validation split and
    the remaining folds are concatenated into the training split.
    """
    assert k > 1
    fold_size = X.shape[0] // k
    train_X_parts, train_y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        # Row range covered by fold j.
        rows = slice(j * fold_size, (j + 1) * fold_size)
        if j == i:
            # Fold i is held out for validation.
            X_valid, y_valid = X[rows, :], y[rows]
        else:
            train_X_parts.append(X[rows, :])
            train_y_parts.append(y[rows])
    X_train = torch.cat(train_X_parts, dim=0)
    y_train = torch.cat(train_y_parts, dim=0)
    return X_train, y_train, X_valid, y_valid
#训练K次并返回训练和验证的平均误差
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation; return the mean final train/valid
    log-RMSE across folds (learning curves are plotted for fold 0 only)."""
    train_l_sum, valid_l_sum = 0, 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        # Fresh network per fold.
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if fold == 0:
            epochs = range(1, num_epochs + 1)
            semilogy(epochs, train_ls, 'epochs', 'rmse',
                     epochs, valid_ls, ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (fold, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
#K折交叉验证函数结束

# Hyperparameters: folds, epochs, learning rate, weight decay, batch size.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 36
#训练, 预测并保存结果
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, and write the
    predictions to ./submission.csv in Kaggle's Id/SalePrice format."""
    model = get_net(train_features.shape[1])
    # No held-out set here: train on everything, track train log-RMSE only.
    train_ls, _ = train(model, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    # Visualize the training curve.
    epochs = range(1, num_epochs + 1)
    semilogy(epochs, train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    # Predict on the test features (detach so numpy conversion is legal).
    predictions = model(test_features).detach().numpy()
    # Attach predictions as a column and save the Id/SalePrice pair as CSV.
    test_data['SalePrice'] = pd.Series(predictions.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./submission.csv', index=False)

# Train on all available data and write submission.csv for upload.
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
# End of the regression script: report elapsed time.
end = time.time()
# NOTE(review): '%lf' is C-style; Python treats it the same as '%f'.
print("the time is %lf" % (end - begin))
# Display the figures created above.
plt.show()
写注释是个好习惯
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from torchsummary import summary
import time

def main():
    """Benchmark a small CNN on FashionMNIST: one full training run on the
    GPU, then an identical run on the CPU, printing the wall-clock time of
    each so the two can be compared.

    NOTE(review): the model class, data loading and training loop are
    duplicated verbatim for the two runs; a shared helper parameterized by
    device would halve this function.
    """
    # ---------------- GPU run ----------------
    # Define the network.
    class CNN(nn.Module):
        # Two conv -> ReLU -> max-pool stages followed by a linear classifier.
        def __init__(self):
            super(CNN, self).__init__()
            # (1, 28, 28) -> conv 5x5 pad 2 -> (16, 28, 28) -> pool -> (16, 14, 14)
            self.layer1 = nn.Sequential(
                nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2),
            )
            # (16, 14, 14) -> conv 5x5 pad 2 -> (32, 14, 14) -> pool -> (32, 7, 7)
            self.layer2 = nn.Sequential(
                nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)
            )
            # Flattened 32*7*7 features -> 10 class logits.
            self.output_layer = nn.Linear(32*7*7, 10)

        def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            # Flatten to (batch, 32*7*7) before the linear layer.
            x = x.reshape(x.size(0), -1)
            output = self.output_layer(x)
            return output

    # Hyperparameters.
    EPOCH = 2
    BATCH_SIZE = 100
    LR = 0.001
    DOWNLOAD = True # set to False once the dataset has been downloaded

    # Download the FashionMNIST training split.
    train_data = datasets.FashionMNIST(
        root='./data', # save path
        train=True, # True = training split, False = test split
        transform=transforms.ToTensor(), # rescales 0~255 pixels to 0~1
        download=DOWNLOAD
    )

    # Legacy attribute names (deprecated in newer torchvision).
    print(train_data.train_data.size())
    print(train_data.train_labels.size())

    # Current attribute names.
    print(train_data.data.size())
    print(train_data.targets.size())

    # Show a couple of sample images from the dataset.
    for i in range(2):
        print(train_data.targets[i].item())
        plt.imshow(train_data.data[i].numpy(), cmap='gray')
        plt.show()

    # DataLoader over the training split.
    train_loader = Data.DataLoader(
        dataset=train_data,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2
    )

    # The test split was downloaded together with the training split.
    test_data = datasets.FashionMNIST(
        root='./data',
        train=False
    )

    print(test_data.data.size())
    print(test_data.targets.size())

    # Build the network.
    cnn = CNN()
    # Move it onto the GPU.
    cnn.cuda()
    print(cnn)

    # Print a layer-by-layer summary of the architecture.
    model = CNN()
    if torch.cuda.is_available():
        model.cuda()
    summary(model, input_size=(1,28,28))

    # Optimizer.
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)

    # Loss function.
    loss_func = nn.CrossEntropyLoss()

    # To save time, evaluate on only the first 2000 test samples.
    with torch.no_grad():
        test_x = Variable(
            torch.unsqueeze(test_data.data, dim=1),
        ).type(torch.FloatTensor)[:2000]/255 # rescale 0~255 to 0~1

    test_y = test_data.targets[:2000]

    # # To use the whole test set instead:
    # test_x = Variable(
    #     torch.unsqueeze(test_data.test_data, dim=1),
    #     volatile=True
    # ).type(torch.FloatTensor)/255 # rescale 0~255 to 0~1

    # test_y = test_data.test_labels

    # Move the evaluation data onto the GPU.
    test_x = test_x.cuda()
    test_y = test_y.cuda()

    # Start timing.
    start = time.time()

    # Train the network.
    for epoch in range(EPOCH):
        for step, (batch_x, batch_y) in enumerate(train_loader):
            # Move this mini-batch onto the GPU.
            batch_x = batch_x.cuda()
            batch_y = batch_y.cuda()
            output = cnn(batch_x)
            loss = loss_func(output, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Report test accuracy every 50 steps.
            if step%50 == 0:
                test_output = cnn(test_x)
                # Argmax over class logits. NOTE(review): the .cuda() here is
                # redundant — test_output already lives on the GPU.
                predict_y = torch.max(test_output, 1)[1].cuda().data.squeeze()
                accuracy = (predict_y == test_y).sum().item() / test_y.size(0)
                print('Epoch', epoch, '|', 'Step', step, '|', 'Loss', loss.data.item(), '|', 'Test Accuracy', accuracy)

    # Stop timing.
    end = time.time()

    # GPU training time.
    print('Time cost:', end - start, 's')

    # Predict on the first 100 test samples.
    test_output = cnn(test_x[:100])
    # A CUDA tensor must be copied back to the CPU before numpy conversion,
    # otherwise: TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
    predict_y = torch.max(test_output, 1)[1].cpu().data.numpy().squeeze()
    real_y = test_y[:100].cpu().numpy()
    print(predict_y)
    print(real_y)

    # Show predictions next to the ground truth.
    for i in range(10):
        print('Predict', predict_y[i])
        print('Real', real_y[i])
        plt.imshow(test_data.data[i].numpy(), cmap='gray')
        plt.show()

    # ---------------- CPU run ----------------
    # Define the network (same architecture as the GPU run above).
    class CNN(nn.Module):
        def __init__(self):
            super(CNN, self).__init__()
            self.layer1 = nn.Sequential(
                nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2),
            )
            self.layer2 = nn.Sequential(
                nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)
            )
            self.output_layer = nn.Linear(32*7*7, 10)

        def forward(self, x):
            x = self.layer1(x)
            x = self.layer2(x)
            x = x.reshape(x.size(0), -1)
            output = self.output_layer(x)
            return output

    # Hyperparameters.
    EPOCH = 2
    BATCH_SIZE = 100
    LR = 0.001
    DOWNLOAD = False # the dataset was already downloaded by the GPU run

    # Load the FashionMNIST training split.
    train_data = datasets.FashionMNIST(
        root='./data', # save path
        train=True, # True = training split, False = test split
        transform=transforms.ToTensor(), # rescales 0~255 pixels to 0~1
        download=DOWNLOAD
    )

    # Legacy attribute names (deprecated in newer torchvision).
    print(train_data.train_data.size())
    print(train_data.train_labels.size())

    # Current attribute names.
    print(train_data.data.size())
    print(train_data.targets.size())

    # Show a couple of sample images from the dataset.
    for i in range(2):
        print(train_data.targets[i].item())
        plt.imshow(train_data.data[i].numpy(), cmap='gray')
        plt.show()

    # DataLoader over the training split.
    train_loader = Data.DataLoader(
        dataset=train_data,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2
    )

    # The test split was downloaded together with the training split.
    test_data = datasets.FashionMNIST(
        root='./data',
        train=False
    )

    print(test_data.data.size())
    print(test_data.targets.size())

    # Build the network (kept on the CPU this time).
    cnn = CNN()
    print(cnn)

    # Print a layer-by-layer summary of the architecture.
    model = CNN()
    if torch.cuda.is_available():
        model.cuda()
    summary(model, input_size=(1,28,28))

    # Optimizer.
    optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)

    # Loss function.
    loss_func = nn.CrossEntropyLoss()

    # To save time, evaluate on only the first 2000 test samples.
    with torch.no_grad():
        test_x = Variable(
            torch.unsqueeze(test_data.data, dim=1),
        ).type(torch.FloatTensor)[:2000]/255 # rescale 0~255 to 0~1

    test_y = test_data.targets[:2000]

    # # To use the whole test set instead:
    # test_x = Variable(
    #     torch.unsqueeze(test_data.test_data, dim=1),
    #     volatile=True
    # ).type(torch.FloatTensor)/255 # rescale 0~255 to 0~1

    # test_y = test_data.test_labels

    # Start timing.
    start = time.time()

    # Train the network.
    for epoch in range(EPOCH):
        for step, (batch_x, batch_y) in enumerate(train_loader):
            output = cnn(batch_x)
            loss = loss_func(output, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Report test accuracy every 50 steps.
            if step%50 == 0:
                test_output = cnn(test_x)
                predict_y = torch.max(test_output, 1)[1].data.squeeze()
                accuracy = (predict_y == test_y).sum().item() / test_y.size(0)
                print('Epoch', epoch, '|', 'Step', step, '|', 'Loss', loss.data.item(), '|', 'Test Accuracy', accuracy)

    # Stop timing.
    end = time.time()

    # CPU training time.
    print('Time cost:', end - start, 's')

    # Predict on the first 100 test samples (already on CPU, no .cpu() needed).
    test_output = cnn(test_x[:100])
    predict_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
    real_y = test_y[:100].numpy()
    print(predict_y)
    print(real_y)

    # Show predictions next to the ground truth.
    for i in range(10):
        print('Predict', predict_y[i])
        print('Real', real_y[i])
        plt.imshow(test_data.data[i].numpy(), cmap='gray')
        plt.show()

# Entry-point guard: required here because the DataLoader uses num_workers=2,
# whose worker processes re-import this module on spawn-based platforms.
if __name__ == '__main__':
    main()
最后我的电脑测试出来大概是 GPU 16s、CPU 38s 的样子。

海纳百川 有容乃大