Hand-written fully connected network

```python
import numpy as np

class Network(object):
    def __init__(self, input_size, hidden_size, output_size, lr):
        self.A1 = init_weights((input_size, hidden_size))
        self.b1 = init_weights((hidden_size,))
        self.A2 = init_weights((hidden_size, output_size))
        self.b2 = init_weights((output_size,))
        self.lr = lr

    def check(self, x):
        # Inference only: forward pass, return the predicted class.
        z1 = np.matmul(x, self.A1) + self.b1
        a1 = relu(z1)
        z2 = np.matmul(a1, self.A2) + self.b2
        return np.argmax(z2)

    def step(self, x_batch, y_batch):
        batch_size = len(x_batch)
        batch_loss = 0
        batch_acc = 0
        # Forward pass
        z1 = np.matmul(x_batch, self.A1) + self.b1
        a1 = relu(z1)
        z2 = np.matmul(a1, self.A2) + self.b2
        a2 = softmax(z2)
        for i in range(batch_size):
            a2[i] /= np.sum(a2[i])  # renormalize for numerical safety
            batch_loss += -np.log(a2[i][np.argmax(y_batch[i])])  # cross entropy
            batch_acc += np.argmax(a2[i]) == np.argmax(y_batch[i])
        # Backward pass: gradient of softmax + cross entropy is (a2 - y)
        d_loss = a2 - y_batch
        d_b2 = np.sum(d_loss, axis=0)
        d_A2 = np.matmul(a1.T, d_loss)
        d_a1 = np.matmul(d_loss, self.A2.T)
        d_z1 = d_a1 * relu_prime(z1)
        d_b1 = np.sum(d_z1, axis=0)
        d_A1 = np.matmul(x_batch.T, d_z1)
        # SGD update, averaged over the batch
        self.A2 -= self.lr * d_A2 / batch_size
        self.b2 -= self.lr * d_b2 / batch_size
        self.A1 -= self.lr * d_A1 / batch_size
        self.b1 -= self.lr * d_b1 / batch_size
        return [batch_loss, batch_acc]
```
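The class above relies on `init_weights`, `relu`, `relu_prime`, and `softmax`, which are not shown in the notes. A minimal sketch of what they might look like (the uniform initialization range is an assumption, not from the original):

```python
import numpy as np

def init_weights(shape):
    # Assumed helper: small random uniform initialization.
    return np.random.uniform(-0.1, 0.1, size=shape)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    # Derivative of ReLU: 1 where x > 0, else 0.
    return (x > 0).astype(x.dtype)

def softmax(z):
    # Row-wise softmax with max-subtraction for numerical stability.
    e = np.exp(z - np.max(z, axis=1, keepdims=True))
    return e / np.sum(e, axis=1, keepdims=True)
```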
Fully connected network with torch; dataset: MNIST
```python
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 32)
        self.fc3 = nn.Linear(32, 10)

    def forward(self, x):
        x = x.view(-1, 784)  # flatten the 28x28 images
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

model = Net()
# forward() already applies log_softmax, so use NLLLoss here;
# pairing it with CrossEntropyLoss would apply log_softmax twice.
criterion = nn.NLLLoss()
# Create the optimizer once, outside the epoch loop.
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(EPOCHS):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    val_loss = []
    model.eval()
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            val_loss.append(criterion(output, target).item())
```
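The loaders and constants referenced above (`train_loader`, `val_loader`, `EPOCHS`) are not shown in the notes; a minimal torchvision sketch, with batch size and epoch count as assumptions:

```python
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchvision import datasets, transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),  # standard MNIST mean/std
])

train_set = datasets.MNIST("./data", train=True, download=True, transform=transform)
val_set = datasets.MNIST("./data", train=False, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=64)

EPOCHS = 10  # assumed value
```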
CNN; dataset: CIFAR-10
```python
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
])

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Block 1: 3 x 32 x 32 -> 64 x 16 x 16
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu1 = nn.ReLU()
        # Block 2: 64 x 16 x 16 -> 128 x 8 x 8
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bn2 = nn.BatchNorm2d(128)
        self.relu2 = nn.ReLU()
        # Block 3: 128 x 8 x 8 -> 256 x 4 x 4
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.bn3 = nn.BatchNorm2d(256)
        self.relu3 = nn.ReLU()
        # Classifier head
        self.fc1 = nn.Linear(256 * 4 * 4, 512)
        self.bn4 = nn.BatchNorm1d(512)
        self.relu4 = nn.ReLU()
        self.fc2 = nn.Linear(512, 128)
        self.bn5 = nn.BatchNorm1d(128)
        self.relu5 = nn.ReLU()
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.relu1(self.bn1(self.pool1(self.conv2(self.conv1(x)))))
        x = self.relu2(self.bn2(self.pool2(self.conv4(self.conv3(x)))))
        x = self.relu3(self.bn3(self.pool3(self.conv7(self.conv6(self.conv5(x))))))
        x = x.view(-1, 256 * 4 * 4)
        x = self.relu4(self.bn4(self.fc1(x)))
        x = self.relu5(self.bn5(self.fc2(x)))
        x = self.fc3(x)
        return x
```
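A test-loop sketch for measuring accuracy on the CIFAR-10 test set (the `test_loader` name and its setup are assumptions):

```python
correct, total = 0, 0
model.eval()
with torch.no_grad():
    for data, target in test_loader:  # assumed CIFAR-10 test DataLoader
        output = model(data)
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()
        total += target.size(0)
print(f"test accuracy: {correct / total:.4f}")
```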
DCGAN (Deep Convolutional GAN); dataset: CelebA. The code can be copied directly from the official PyTorch tutorial.
Generator:
```python
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            # latent vector z (nz) -> (ngf*8) x 4 x 4
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # -> (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # -> (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # -> ngf x 32 x 32
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # -> nc x 64 x 64, values in [-1, 1]
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, input):
        return self.net(input)
```
Discriminator:
```python
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            # nc x 64 x 64 -> ndf x 32 x 32
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # -> (ndf*2) x 16 x 16
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # -> (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # -> (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # -> a single "real" probability per image
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.net(input)
```
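The training loop below also needs the hyperparameters and setup from the tutorial (the CelebA `dataloader` is assumed to exist); a sketch using the tutorial's usual defaults, with the exact values treated as assumptions:

```python
nz, ngf, ndf, nc = 100, 64, 64, 3        # latent dim, G/D feature maps, image channels
real_label, fake_label = 1.0, 0.0
num_epochs = 5                           # assumed value
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def weights_init(m):
    # DCGAN init: conv weights ~ N(0, 0.02), BatchNorm weights ~ N(1, 0.02).
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

netG = Generator().to(device)
netD = Discriminator().to(device)
netG.apply(weights_init)
netD.apply(weights_init)

criterion = nn.BCELoss()
optimizerD = optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
G_losses, D_losses = [], []
```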
```python
for epoch in range(num_epochs):
    for i, data in enumerate(dataloader, 0):
        # (1) Update D: maximize log(D(x)) + log(1 - D(G(z)))
        netD.zero_grad()
        real_cpu = data[0].to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        output = netD(real_cpu).view(-1)
        errD_real = criterion(output, label)
        errD_real.backward()

        noise = torch.randn(b_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label)
        # detach() so gradients do not flow into G during the D update
        output = netD(fake.detach()).view(-1)
        errD_fake = criterion(output, label)
        errD_fake.backward()
        errD = errD_real + errD_fake
        optimizerD.step()

        # (2) Update G: maximize log(D(G(z))), i.e. use real labels for fake images
        netG.zero_grad()
        label.fill_(real_label)
        output = netD(fake).view(-1)
        errG = criterion(output, label)
        errG.backward()
        optimizerG.step()

        G_losses.append(errG.item())
        D_losses.append(errD.item())
```
RNN: a hand-written LSTM for sentiment classification; dataset: IMDB movie reviews
```python
import numpy as np
import torch
from torch import nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # One (U, V, b) triple per gate: forget, input, cell candidate, output.
        self.U_f = nn.Parameter(torch.Tensor(input_size, hidden_size))
        self.V_f = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.Tensor(hidden_size))
        self.U_i = nn.Parameter(torch.Tensor(input_size, hidden_size))
        self.V_i = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.Tensor(hidden_size))
        self.U_c = nn.Parameter(torch.Tensor(input_size, hidden_size))
        self.V_c = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_c = nn.Parameter(torch.Tensor(hidden_size))
        self.U_o = nn.Parameter(torch.Tensor(input_size, hidden_size))
        self.V_o = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.Tensor(hidden_size))
        self.init_weights()

    def init_weights(self):
        w = 1.0 / np.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-w, w)

    def forward(self, x):
        bs, seq_len, dim = x.size()
        hidden_seq = []
        # Start from zero hidden and cell states (h_0 and c_0 were undefined in the original).
        h_t = torch.zeros(bs, self.hidden_size, device=x.device)
        c_t = torch.zeros(bs, self.hidden_size, device=x.device)
        for t in range(seq_len):
            x_t = x[:, t, :]
            i_t = torch.sigmoid(x_t @ self.U_i + h_t @ self.V_i + self.b_i)  # input gate
            f_t = torch.sigmoid(x_t @ self.U_f + h_t @ self.V_f + self.b_f)  # forget gate
            g_t = torch.tanh(x_t @ self.U_c + h_t @ self.V_c + self.b_c)     # cell candidate
            o_t = torch.sigmoid(x_t @ self.U_o + h_t @ self.V_o + self.b_o)  # output gate
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t.unsqueeze(0))
        hidden_seq = torch.cat(hidden_seq, dim=0)
        # (seq_len, bs, hidden) -> (bs, seq_len, hidden)
        hidden_seq = hidden_seq.transpose(0, 1).contiguous()
        return hidden_seq
```
```python
class Net(nn.Module):
    def __init__(self, embedding_size=64, hidden_size=64, mlp_embedding_dim=64, num_classes=2):
        super(Net, self).__init__()
        # vocab_size is assumed to be defined globally from the IMDB vocabulary.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = LSTM(input_size=embedding_size, hidden_size=hidden_size)
        self.linear1 = nn.Linear(in_features=hidden_size, out_features=mlp_embedding_dim)
        self.act1 = torch.nn.ReLU()
        self.linear2 = nn.Linear(in_features=mlp_embedding_dim, out_features=num_classes)

    def forward(self, x):
        x = self.embedding(x)      # (bs, seq_len) -> (bs, seq_len, emb)
        x = self.lstm(x)           # (bs, seq_len, hidden)
        x = torch.mean(x, dim=1)   # mean-pool over time steps
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        return x
```
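A minimal training-loop sketch for this classifier; the IMDB loader, its batch format, and the hyperparameters are assumptions:

```python
from torch import optim

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    for tokens, labels in train_loader:  # assumed DataLoader of (token ids, 0/1 labels)
        optimizer.zero_grad()
        logits = model(tokens)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
```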
Transformer: code omitted.
Final exam review notes

Object detection
IoU; NMS (non-maximum suppression) — a NumPy sketch appears after this list.
R-CNN; SPP-Net (an improvement on R-CNN); improved further by Fast R-CNN, which introduces Region of Interest (RoI) pooling and trains the classifier and bounding-box regressor jointly with the feature extractor, but still uses Selective Search; Faster R-CNN replaces Selective Search with a Region Proposal Network (RPN).
YOLO: divide the image into grid cells + NMS; v2 adds prior (anchor) boxes; SSD.
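As referenced above, a NumPy sketch of IoU and greedy NMS (the [x1, y1, x2, y2] box format is an assumption):

```python
import numpy as np

def iou(box, boxes):
    # IoU between one box and an array of boxes, format [x1, y1, x2, y2].
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2])
    y2 = np.minimum(box[3], boxes[:, 3])
    inter = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
    area_a = (box[2] - box[0]) * (box[3] - box[1])
    area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / (area_a + area_b - inter)

def nms(boxes, scores, iou_thresh=0.5):
    # Greedy NMS: keep the highest-scoring box, drop heavily overlapping ones, repeat.
    order = np.argsort(scores)[::-1]
    keep = []
    while len(order) > 0:
        best = order[0]
        keep.append(best)
        rest = order[1:]
        order = rest[iou(boxes[best], boxes[rest]) <= iou_thresh]
    return keep
```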
Image segmentation

Pixel-level classification; Fully Convolutional Network (FCN); skip connections.

SegNet, PSPNet, U-Net.

Pixel-wise cross entropy makes large objects contribute a disproportionately large loss; the Dice coefficient is used to address this — see the sketch below.
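A sketch of a soft Dice loss in PyTorch (the smoothing constant is an assumption):

```python
import torch

def dice_loss(pred, target, eps=1.0):
    # pred: predicted foreground probabilities, target: binary mask, both (bs, H, W).
    pred = pred.reshape(pred.size(0), -1)
    target = target.reshape(target.size(0), -1)
    inter = (pred * target).sum(dim=1)
    # Dice = 2|A∩B| / (|A| + |B|); the loss is 1 - Dice, averaged over the batch.
    dice = (2 * inter + eps) / (pred.sum(dim=1) + target.sum(dim=1) + eps)
    return 1 - dice.mean()
```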
Pose estimation

Top-down: detect person boxes first, then keypoints.

Bottom-up: detect keypoints first, then group them into people.

Convolutional Pose Machine (CPM).

OpenPose = CPM + bottom-up.

Pose Proposal Network (PPN) = YOLO + OpenPose.
GAN (Vanilla GAN)
Objective: $\min_G \max_D V(D, G) = \mathbb{E}_{x \sim p_{\text{data}}}[\log D(x)] + \mathbb{E}_{z \sim p_z}[\log(1 - D(G(z)))]$. At the optimum, $D^*(x) = \frac{1}{2}$, i.e. the discriminator labels fake and real each with probability 0.5.
(Making D optimal beforehand can cause unstable training, vanishing gradients, and G failing to capture the true data distribution.)
DCGAN — Deep Convolutional GAN
Uses convolutional layers.

Loss: using mean squared error (MSE) leaves some details indistinct (blurry outputs).
Variational Autoencoder
X is encoded into z; z is passed through the decoder G to obtain the reconstruction $\hat{X}$; then compute the L2 loss.
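A minimal sketch of the VAE forward pass with the reparameterization trick (the encoder/decoder shapes here are assumptions):

```python
import torch
from torch import nn

class VAE(nn.Module):
    def __init__(self, x_dim=784, z_dim=16):
        super().__init__()
        self.enc = nn.Linear(x_dim, 2 * z_dim)  # outputs mean and log-variance
        self.dec = nn.Linear(z_dim, x_dim)      # the "G" that reconstructs x

    def forward(self, x):
        mu, logvar = self.enc(x).chunk(2, dim=1)
        # Reparameterization: z = mu + sigma * eps keeps sampling differentiable.
        z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)
        x_hat = self.dec(z)
        # L2 reconstruction loss plus the KL term pulling z toward N(0, I).
        recon = ((x_hat - x) ** 2).sum(dim=1).mean()
        kl = -0.5 * (1 + logvar - mu ** 2 - logvar.exp()).sum(dim=1).mean()
        return recon + kl
```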
Conditional GAN
BiGAN (with an Encoder); unsupervised.

CoGAN (face swapping, adding a beard, changing hair, ...).

CycleGAN (learns an unknown mapping between two domains, similar to CoGAN; e.g. landscape photos to oil paintings, ...).
RNN (Recurrent Neural Network)
Word Embedding
Word2Vec; Continuous Bag-of-Words (CBOW): predict the middle word from its context, with randomly sampled negative examples (Noise-Contrastive Estimation). A sketch follows.
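A sketch of CBOW with negative sampling in PyTorch; the vocabulary size, embedding dimension, and batch format are assumptions:

```python
import torch
from torch import nn
import torch.nn.functional as F

class CBOW(nn.Module):
    def __init__(self, vocab_size=10000, dim=64):
        super().__init__()
        self.in_emb = nn.Embedding(vocab_size, dim)   # context-word vectors
        self.out_emb = nn.Embedding(vocab_size, dim)  # target-word vectors

    def forward(self, context, target, negatives):
        # context: (bs, window), target: (bs,), negatives: (bs, k) sampled noise words
        h = self.in_emb(context).mean(dim=1)              # average the context embeddings
        pos = torch.sum(h * self.out_emb(target), dim=1)  # score of the true middle word
        neg = torch.bmm(self.out_emb(negatives), h.unsqueeze(2)).squeeze(2)
        # Maximize sigmoid(pos) and minimize sigmoid(neg): the negative-sampling objective.
        return -(F.logsigmoid(pos) + F.logsigmoid(-neg).sum(dim=1)).mean()
```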
LSTM
Used for sentiment classification, many-to-one; hw9.
Transformer
Machine translation, many-to-many.

Attention (multi-head); the Transformer architecture; hw10. A sketch of the attention core follows.
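A sketch of scaled dot-product attention, the core operation of the Transformer (single head, masking omitted):

```python
import math
import torch

def attention(q, k, v):
    # q, k, v: (bs, seq_len, d_k). Scores are scaled by sqrt(d_k)
    # to keep the softmax gradients well-behaved.
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    weights = torch.softmax(scores, dim=-1)  # each query attends over all keys
    return torch.matmul(weights, v)
```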
GPT (Generative Pre-Trained Transformer): a generative language model built on Transformer pre-training that predicts the probability distribution of the next word (unidirectional prediction).

BERT: predicts from bidirectional context.

Both are pre-trained without labels; NLP = Natural Language Processing.
AI systems

Data acquisition, data preprocessing, modeling and tuning, deployment.

Python toolkits for parsing HTML: beautifulsoup4; the selenium tool (browser automation).

XPath: a language for locating information in XML documents. A scraping sketch follows.
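A minimal scraping sketch using beautifulsoup4 alongside lxml's XPath support (the URL and selectors here are hypothetical):

```python
import requests
from bs4 import BeautifulSoup
from lxml import etree

html = requests.get("https://example.com").text  # hypothetical URL

# Parse with BeautifulSoup and select by tag.
soup = BeautifulSoup(html, "html.parser")
titles = [a.get_text() for a in soup.find_all("a")]

# The same query expressed as XPath via lxml.
tree = etree.HTML(html)
titles_xpath = tree.xpath("//a/text()")
```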
Deep learning lets the network extract features automatically; for example, some CNNs are in effect learned feature extractors.
Feature selection

1) Filter

Pearson correlation coefficient.

Lasso feature selection (works because many coefficients are driven exactly to 0).
Gini index: let $p_k$ be the fraction of label $k$ among the samples at a subtree node; the Gini index is $G = 1 - \sum_k p_k^2$.

Gini decrease: $\Delta G = G_{\text{parent}} - \sum_{\text{child}} \frac{n_{\text{child}}}{n_{\text{parent}}} G_{\text{child}}$, the impurity reduction achieved by a split.

Gini importance: the total (or averaged) Gini decrease over all splits that use a given feature.

After training a random forest, these Gini importances can be read off directly — see the sketch below.
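A sketch of filter-style selection with sklearn; the synthetic dataset and the correlation threshold are assumptions:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Synthetic stand-in data: X (n_samples, n_features), y labels.
X = np.random.rand(200, 10)
y = (X[:, 0] + X[:, 3] > 1).astype(int)

# Filter: keep features whose absolute Pearson correlation with y is high enough.
corr = np.array([abs(np.corrcoef(X[:, j], y)[0, 1]) for j in range(X.shape[1])])
keep = corr > 0.1  # assumed threshold

# Gini importances from a trained random forest.
rf = RandomForestClassifier(n_estimators=100).fit(X, y)
print(rf.feature_importances_)  # mean decrease in Gini impurity per feature
```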
2) Wrapper

Search and iterate: permute the values of one feature at a time and measure the performance drop (permutation importance); SHAP. A sketch follows.
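Permutation importance via sklearn, reusing the `rf`, `X`, and `y` from the sketch above:

```python
from sklearn.inspection import permutation_importance

# Shuffle each feature column in turn and measure how much the score drops.
result = permutation_importance(rf, X, y, n_repeats=10)
print(result.importances_mean)
```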
Ensemble learning

Bagging / reduces variance

Boosting / reduces bias

Stacking / reduces variance

AutoGluon can perform model selection automatically.
Model testing

Hyperparameter optimization (HPO)

Neural architecture search (NAS)

HPO: multi-fidelity methods (train on a subsampled dataset, reduce model size, early stopping).

NAS: reinforcement learning (slow); one-shot methods (train the most promising candidates from scratch, caring only about the candidates' ranking after a few epochs). A random-search sketch follows.
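A minimal random-search HPO sketch; the search space and the stand-in objective are hypothetical:

```python
import random

def train_and_eval(lr, batch_size):
    # Stand-in objective; in practice, train a model with these
    # hyperparameters and return its validation accuracy.
    return 1.0 - abs(lr - 0.01) - 0.001 * batch_size / 128

best_cfg, best_acc = None, float("-inf")
for trial in range(20):
    cfg = {
        "lr": 10 ** random.uniform(-4, -1),         # sample lr log-uniformly
        "batch_size": random.choice([32, 64, 128]),
    }
    acc = train_and_eval(**cfg)
    if acc > best_acc:
        best_cfg, best_acc = cfg, acc
print(best_cfg, best_acc)
```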