实际操作一下语音brid单词识别

实际操作一下语音 brid 单词识别

第一步：定义一个函数

这里推荐使用 DSCNN（深度可分离卷积）神经网络

# 具体的网络结构
# models/ds_cnn.py
import torch.nn as nn


class DSCNN(nn.Module):
    """Depthwise-separable convolutional network for classification.

    A standard 3x3 convolution stage is followed by two depthwise-separable
    stages; every stage ends with batch norm, ReLU and a 2x2 max-pool.
    Global average pooling then feeds a two-layer classifier with dropout,
    which emits one raw score (logit) per class -- no softmax is applied.

    Args:
        num_classes: Output size of the final linear layer.
        input_channels: Channel count expected by the first convolution.
    """

    def __init__(self, num_classes=4, input_channels=1):
        super().__init__()

        # Feature extractor. Layers are instantiated in forward order and
        # gathered in one flat list so their positions inside
        # ``self.features`` (and therefore the parameter names) stay fixed.
        feature_layers = [
            # Stem: ordinary convolution.
            nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
        ]
        feature_layers += self._separable_stage(64, 128)
        feature_layers += self._separable_stage(128, 256)
        self.features = nn.Sequential(*feature_layers)

        # Classifier head. The adaptive pool collapses each feature map to
        # 1x1, so the linear layers do not depend on the input's spatial size.
        head_layers = [
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @staticmethod
    def _separable_stage(in_channels, out_channels):
        """Return the layers of one depthwise-separable stage, in order."""
        return [
            # Depthwise: one 3x3 filter per input channel (groups == channels).
            nn.Conv2d(
                in_channels,
                in_channels,
                kernel_size=3,
                groups=in_channels,
                padding=1,
            ),
            # Pointwise: 1x1 convolution that mixes channels.
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
        ]

    def forward(self, x):
        """Run the feature extractor, then the classifier.

        Args:
            x: Input tensor; assumed to be ``(batch, input_channels, H, W)``
                -- TODO confirm against the data pipeline.

        Returns:
            Tensor of per-class logits with ``num_classes`` entries per sample.
        """
        return self.classifier(self.features(x))

第二步：定义一个 Loss

这是一个分类问题：将一个读音分为 brid、other、background 这几类，所以 Loss 使用 CrossEntropyLoss

# Cross-entropy loss for the multi-class classification task.
criterion = nn.CrossEntropyLoss()

第三步：Optimization

优化器引入 Adam

# Adam optimizer over all model parameters; the learning rate is read from config['learning_rate'].
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])

开练，使用 backpropagation

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train the model for one epoch.

    Args:
        model: Module to train; it is switched to training mode first.
        dataloader: Iterable yielding ``(inputs, labels)`` batches; must
            support ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose gradients are zeroed and stepped once per
            batch.
        device: Device every batch is moved to before the forward pass.

    Returns:
        A tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses
        and the accuracy as a percentage (0-100). Either value is ``0.0``
        when there were no batches / no samples, instead of raising
        ``ZeroDivisionError``.
    """
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for inputs, labels in tqdm(dataloader, desc="训练"):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass (gradients from the previous batch are cleared first).
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Running statistics; the predicted class is the argmax over dim 1.
        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    # Guard both divisions so an empty dataloader does not crash the run.
    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = 100. * correct / total if total else 0.0
    return mean_loss, accuracy

![[image.png|550]]