PyTorch Study Notes

0. Open Questions

1. What is the relationship and difference between optimizer.step(), loss.backward(), and scheduler.step() in PyTorch?

1. Linear Regression

1. Understanding mini-batch stochastic gradient descent:

(A sample can reflect the properties of the whole reasonably well, so there is no need to compute over the entire dataset; statistics computed on the sample, such as the mean, are representative enough.)

Draw a mini-batch of samples uniformly at random from the dataset, compute the gradient of the average loss over that mini-batch, and then use this gradient to update all model parameters.

B is the batch size used in the mini-batch computation.
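
In symbols (a standard formulation that matches the sgd function later in these notes): with learning rate lr and a randomly drawn mini-batch B of size |B| = batch_size, each parameter w is updated as

    w ← w − (lr / |B|) · Σ_{i ∈ B} ∂loss_i(w) / ∂w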

2. Timing computations

import time

# define a timer class to record time
class Timer(object):
    """Record multiple running times."""
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        # start the timer
        self.start_time = time.time()

    def stop(self):
        # stop the timer and record time into a list
        self.times.append(time.time() - self.start_time)
        return self.times[-1]

    def avg(self):
        # calculate the average and return
        return sum(self.times)/len(self.times)

    def sum(self):
        # return the sum of recorded time
        return sum(self.times)
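
A quick usage sketch of the class above (illustrative; the workload is just a placeholder):

timer = Timer()          # the constructor already calls start()
sum(range(1000000))      # placeholder work
print(timer.stop())      # seconds elapsed for the first run
timer.start()
sum(range(1000000))
print(timer.stop())
print('avg:', timer.avg(), 'total:', timer.sum())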

3. Basic PyTorch usage

features = torch.randn(num_examples, num_inputs, dtype=torch.float32)  # tensor with num_examples rows and num_inputs columns; features has type torch.Tensor

labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()),
                       dtype=torch.float32)  # add Gaussian noise generated with NumPy
plt.scatter(features[:, 1].numpy(), labels.numpy(), 5)  # Tensors must be converted with .numpy() before plotting
features.size()  # torch.Size([1000, 2])
len(features)    # 1000

indices = list(range(100))
random.shuffle(indices)  # shuffle the list of indices in place

b = torch.zeros(1, dtype=torch.float32)
b.requires_grad_(requires_grad=True)  # track operations on b so its gradient can be computed

# calling the parent-class constructor
class Conv2D(nn.Module):
    def __init__(self, kernel_size):
        super(Conv2D, self).__init__()  # call the parent class's __init__ constructor
        self.weight = nn.Parameter(torch.randn(kernel_size))
        self.bias = nn.Parameter(torch.randn(1))

    def forward(self, x):
        return corr2d(x, self.weight) + self.bias
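
corr2d is not defined in these notes; it is assumed to be the usual 2D cross-correlation helper from the d2l materials. A minimal sketch:

def corr2d(X, K):
    # 2D cross-correlation: slide the kernel K over the input X (no padding, stride 1)
    h, w = K.shape
    Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y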

4. Iterating over the features tensor, taking batch_size examples at a time

def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # read the samples in random order
    for i in range(0, num_examples, batch_size):  # i steps through 0, batch_size, 2*batch_size, ...
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])  # the last batch may be smaller than batch_size
        yield features.index_select(0, j), labels.index_select(0, j)  # yield makes this a generator: like return plus a saved position, execution resumes here on the next call
        # index_select(0, j) picks the rows (dimension 0) at the indices given by the LongTensor j

batch_size = 10
for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break  # stop after the first batch; only one yield is consumed

5. Squared-error loss function

def squared_loss(y_hat, y):
    return (y_hat - y.view(y_hat.size())) ** 2 / 2  # .view reshapes y so its shape matches y_hat

6. Mini-batch stochastic gradient descent

def sgd(params, lr, batch_size):
    for param in params:
        # the loss was summed over the batch, so divide by batch_size to average
        param.data -= lr * param.grad / batch_size

7. Training the linear model
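
The training code below uses linreg, w, and b, which are defined elsewhere in the notebook; a minimal sketch consistent with how they are used here:

num_inputs = 2  # assumed to match the feature dimension generated above
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)

def linreg(X, w, b):
    # linear model: X is (batch, num_inputs), w is (num_inputs, 1)
    return torch.mm(X, w) + b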

# hyperparameter init
lr = 0.03
num_epochs = 5

net = linreg
loss = squared_loss

# training
for epoch in range(num_epochs):  # training repeats num_epochs times
    # in each epoch, all the samples in dataset will be used once

    # X is the feature and y is the label of a batch sample
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()  
        # calculate the gradient of batch sample loss 
        l.backward()  
        # using small batch random gradient descent to iter model parameters
        sgd([w, b], lr, batch_size)  
        # reset parameter gradient
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))

2. Softmax and Classification Models

1. Why the cross-entropy loss is used

In a classification problem, compare the prediction x1 = 0.6, x2 = 0.2, x3 = 0.2 with x1 = 0.6, x2 = 0.4, x3 = 0. Both are measured against the target (1, 0, 0) and both assign 0.6 to the true class, yet their squared losses differ. For classification the squared loss is overly strict: only the prediction for the true class really matters.

In effect, only the term where y = 1 contributes to the loss.

The goal of training: make the predicted probability for the correct class as large as possible.
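
A minimal sketch of this idea in code (d2l-style; assumes y_hat holds per-class probabilities after softmax and y holds integer class labels):

def cross_entropy(y_hat, y):
    # only the predicted probability of the true class enters the loss
    return -torch.log(y_hat.gather(1, y.view(-1, 1)))

For example, two predictions that both give the true class probability 0.6 receive the same cross-entropy loss, no matter how the remaining 0.4 is distributed.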

4. Text Preprocessing

5. Language Models

6. Overfitting, Underfitting, and Their Remedies

1. Causes of overfitting

The model fits the training data too closely; typical causes are (1) too little training data and (2) a model that is too complex.

2. Relationship between model complexity and error

3. Generating training data

n_train, n_test, true_w, true_b = 100, 100, [1.2, -3.4, 5.6], 5
features = torch.randn((n_train + n_test, 1))
poly_features = torch.cat((features, torch.pow(features, 2), torch.pow(features, 3)), 1)
# target: y = 1.2*x - 3.4*x^2 + 5.6*x^3 + 5 + noise
labels = (true_w[0] * poly_features[:, 0] + true_w[1] * poly_features[:, 1]
          + true_w[2] * poly_features[:, 2] + true_b)
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)

4. The model-training function

num_epochs, loss = 100, torch.nn.MSELoss()  # mean squared error loss
def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = torch.nn.Linear(train_features.shape[-1], 1)  # a linear layer y = w*x + b; (train_features.shape[-1], 1) sets the shape of w
    batch_size = min(10, train_labels.shape[0])
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)  # wrap the tensors in a TensorDataset
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)  # mini-batch loader

    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)  # gradient descent over the layer's parameters
    train_ls, test_ls = [], []

    for _ in range(num_epochs):
#################################################################
## Open question: how do backward() and step() fit together logically? Are they a single unit?
        for X, y in train_iter:
            l = loss(net(X), y.view(-1, 1))  # view reshapes y into a column vector
            optimizer.zero_grad()  # clear the accumulated gradients
            l.backward()           # compute the gradients
            optimizer.step()       # update the parameters
 ###############################################################
        train_labels = train_labels.view(-1, 1)
        test_labels = test_labels.view(-1, 1)
        train_ls.append(loss(net(train_features), train_labels).item())  # .item() extracts the value of a one-element tensor
        test_ls.append(loss(net(test_features), test_labels).item())
    print("final epoch: train loss", train_ls[-1], 'test loss', test_ls[-1])
    semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
             range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('weight:', net.weight.data, '\nbias:', net.bias.data)
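
semilogy is not defined here; it is the plotting helper from the d2l materials (used as d2l.semilogy in the next section). Roughly, it draws one or two curves with a log-scaled y-axis, something like:

import matplotlib.pyplot as plt

def semilogy(x_vals, y_vals, x_label, y_label,
             x2_vals=None, y2_vals=None, legend=None):
    # plot the train (and optionally test) curve on a log-scaled y-axis
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()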

5. Weight decay (L2 regularization)

def fit_and_plot_pytorch(wd):  # wd is the weight-decay strength
    net = nn.Linear(num_inputs, 1)  # the weight has shape [num_inputs, 1]; this is a linear model, not a multilayer perceptron
    # initialize the parameters (names ending in "weight" are the ones to decay)
    nn.init.normal_(net.weight, mean=0, std=1)
    nn.init.normal_(net.bias, mean=0, std=1)
    # one optimizer applies weight decay to the weight; the other leaves the bias undecayed
    optimizer_w = torch.optim.SGD(params=[net.weight], lr=lr, weight_decay=wd)
    optimizer_b = torch.optim.SGD(params=[net.bias], lr=lr)

    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X), y).mean()
            optimizer_w.zero_grad()
            optimizer_b.zero_grad()

            l.backward()  # compute the gradients

            # call step() on both optimizer instances to update the weight and the bias separately
            optimizer_w.step()
            optimizer_b.step()
        train_ls.append(loss(net(train_features), train_labels).mean().item())
        test_ls.append(loss(net(test_features), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', net.weight.data.norm().item())
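
Note on what weight_decay does here: for plain SGD, weight_decay=wd adds wd * w to the gradient before the update, i.e. w ← w − lr · (∂loss/∂w + wd · w), which is the gradient form of an added L2 penalty (wd/2) · ||w||². That is why the weight is given to an optimizer with weight_decay set while the bias is not.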

6. Dropout

Dropout reduces overfitting.

def dropout(X, drop_prob):
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # in this case every element is dropped
    if keep_prob == 0:
        return torch.zeros_like(X)
    mask = (torch.rand(X.shape) < keep_prob).float()  # each entry is 1 with probability keep_prob and 0 otherwise

    return mask * X / keep_prob  # rescale so the expected value is unchanged
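
A quick sanity check of the function above (illustrative):

X = torch.arange(16).view(2, 8)
print(dropout(X, 0))    # unchanged: every element is kept
print(dropout(X, 0.5))  # roughly half the entries are zeroed, the survivors are scaled by 2
print(dropout(X, 1))    # all zeros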

def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        if isinstance(net, torch.nn.Module):  # a PyTorch module
            net.eval()  # evaluation mode; this disables dropout
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()  # count correct predictions
            net.train()  # switch back to training mode
        else:  # a custom model (a plain function)
            if "is_training" in net.__code__.co_varnames:  # if it takes an is_training argument
                # pass is_training=False
                acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
            else:
                acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()

        n += y.shape[0]
    return acc_sum / n

18. Advanced Convolutional Neural Networks

1. AlexNet

2. VGG block

The convolution layers in a VGG block keep the input height and width unchanged, while the pooling layer (2x2) halves them.

As architectures developed, VGG blocks were adopted to make network design more regular; a sketch of such a block follows below.
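
A minimal sketch of such a block (following the common d2l-style implementation; nn is torch.nn):

def vgg_block(num_convs, in_channels, out_channels):
    layers = []
    for _ in range(num_convs):
        # 3x3 convolution with padding 1 keeps height and width unchanged
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        in_channels = out_channels
    # 2x2 max pooling with stride 2 halves height and width
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)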

3. NiN block

A NiN block consists of a convolution layer followed by two 1x1 convolution layers that act like "fully connected" layers applied per pixel; using convolutions here avoids having to reshape the feature maps (see the sketch below).

VGG controls the number of outputs through its fully connected layers.

NiN controls the number of outputs in its final stage through the number of channels of the convolution layers.
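
A minimal sketch of a NiN block (again d2l-style; the 1x1 convolutions act as per-pixel "fully connected" layers, so no reshape is needed):

def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),  # 1x1 conv mixes channels at each pixel
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU(),
    )

In the final stage the number of output channels is set to the number of classes, and global average pooling then turns each channel into one output value.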

19. Vanishing and Exploding Gradients

As discussed when choosing activation functions, sigmoid and tanh should be avoided in deep networks where possible: they squash their inputs into [0, 1] and [-1, 1] respectively, so their gradients become very small once the inputs are large in magnitude, which aggravates gradient vanishing (see the small check below).
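
A small check of this effect (illustrative):

import torch

x = torch.tensor([0.0, 5.0, 10.0], requires_grad=True)
torch.sigmoid(x).sum().backward()
print(x.grad)  # roughly 0.25, 0.0066, 0.000045: the gradient shrinks rapidly as |x| grows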
