モデルビルダー

学習目標: 層を組み合わせて典型的なニューラルネットワークを構築するパターンを身につける

MLP (多層パーセプトロン)

柔軟な層数の MLP

import torch
import torch.nn as nn

class MLP(nn.Module):
    """Multi-layer perceptron assembled from a list of hidden-layer widths.

    Every Linear layer except the last one is followed by an optional
    BatchNorm1d, the selected activation, and an optional Dropout, in that
    order. The output layer is a bare Linear.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable of hidden-layer widths (may be empty).
        output_dim: Number of output features.
        activation: One of 'relu', 'gelu', 'tanh' or 'leaky' (LeakyReLU with
            negative slope 0.2). Any other key raises KeyError.
        dropout: Dropout probability; Dropout layers are added only if > 0.
        batch_norm: If true, add BatchNorm1d after each hidden Linear.
    """
    def __init__(self, input_dim, hidden_dims, output_dim,
                 activation='relu', dropout=0.0, batch_norm=False):
        super().__init__()
        widths = [input_dim, *hidden_dims, output_dim]
        make_act = {
            'relu': nn.ReLU,
            'gelu': nn.GELU,
            'tanh': nn.Tanh,
            'leaky': lambda: nn.LeakyReLU(0.2),
        }[activation]

        last_idx = len(widths) - 2
        modules = []
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            modules.append(nn.Linear(n_in, n_out))
            if idx == last_idx:
                continue  # the output layer gets no norm/activation/dropout
            if batch_norm:
                modules.append(nn.BatchNorm1d(n_out))
            modules.append(make_act())
            if dropout > 0:
                modules.append(nn.Dropout(dropout))

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the result."""
        return self.net(x)


# Example: a 784 -> 512 -> 256 -> 128 -> 10 MLP with GELU, dropout and batch norm
model = MLP(
    input_dim=784,
    hidden_dims=[512, 256, 128],
    output_dim=10,
    activation='gelu',
    dropout=0.2,
    batch_norm=True,
)
print(model)

タスク別の最終層

| タスク | 最終層 | 損失関数 |
|---|---|---|
| 多クラス分類 | Linear(softmax は損失内) | CrossEntropyLoss |
| 多ラベル分類 | Linear(sigmoid は損失内) | BCEWithLogitsLoss |
| 回帰 | Linear(活性化なし) | MSELoss |
| 順序回帰 | Linear → cumulative | CORN / OrdinalLoss |

CNN ビルダー

基本的なCNN(VGGスタイル)

def conv_block(in_ch, out_ch, n_convs=2, pool=True):
    """Build ``n_convs`` repetitions of Conv2d(3x3) + BatchNorm2d + ReLU.

    The first convolution maps ``in_ch`` to ``out_ch`` channels; the following
    ones keep ``out_ch``. When ``pool`` is true a 2x2 max-pool is appended.

    Returns:
        An ``nn.Sequential`` holding the layers in order.
    """
    stages = []
    src_ch = in_ch
    for _ in range(n_convs):
        stages.extend((
            nn.Conv2d(src_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ))
        src_ch = out_ch  # every convolution after the first is out_ch -> out_ch
    if pool:
        stages.append(nn.MaxPool2d(2))
    return nn.Sequential(*stages)


class SimpleCNN(nn.Module):
    """VGG-style CNN: stacked conv blocks, global average pool, linear head.

    Args:
        n_classes: Number of outputs of the final Linear layer.
        channels: Output width of each successive ``conv_block``; must be
            non-empty because the head is sized from ``channels[-1]``.
        in_channels: Number of channels of the input image. Defaults to 3,
            which was previously hard-coded.
    """
    def __init__(self, n_classes=10, channels=(64, 128, 256, 512),
                 in_channels=3):
        super().__init__()
        blocks = []
        in_ch = in_channels
        for out_ch in channels:
            blocks.append(conv_block(in_ch, out_ch))
            in_ch = out_ch  # the next block consumes this block's output
        self.features = nn.Sequential(*blocks)
        # Adaptive pooling to 1x1 makes the head independent of input size.
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(channels[-1], n_classes)

    def forward(self, x):
        """Return class scores computed from the image batch ``x``."""
        x = self.features(x)
        x = self.pool(x).flatten(1)
        return self.fc(x)

残差ブロック (ResNet-style)

class ResBlock(nn.Module):
    """Two-convolution residual block (ResNet style).

    The main path is Conv3x3 -> BN -> ReLU -> Conv3x3 -> BN. The input is
    added back through ``shortcut`` before the final ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).
    """
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        # Convolutions skip the bias because each is followed by BatchNorm.
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, stride, padding=1, bias=False)
        self.bn1   = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(out_ch)
        # A 1x1 projection is required whenever the main path changes the
        # spatial size or channel count; otherwise the input is added as-is.
        self.shortcut = (
            nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride, bias=False),
                nn.BatchNorm2d(out_ch),
            ) if stride != 1 or in_ch != out_ch else nn.Identity()
        )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # nn.functional is reached through the existing ``nn`` import, so the
        # block does not rely on a separate ``F`` alias being in scope.
        out = nn.functional.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + self.shortcut(x)
        return nn.functional.relu(out)

Pre-trained backbone を使う

from torchvision import models

# Use an ImageNet-pretrained ResNet50 as the feature extractor
n_classes = 10  # number of target classes; adjust for your dataset
backbone = models.resnet50(weights='IMAGENET1K_V1')
# Replace the classification head with one sized for the target task
backbone.fc = nn.Linear(backbone.fc.in_features, n_classes)

# Linear probing: freeze everything, then re-enable only the new head
for p in backbone.parameters():
    p.requires_grad = False
for p in backbone.fc.parameters():
    p.requires_grad = True

系列モデル

LSTM / GRU

class SeqClassifier(nn.Module):
    """Sequence classifier: Embedding -> (Bi)LSTM -> Linear on the final state.

    Args:
        vocab_size: Number of rows of the embedding table; index 0 is the
            padding index.
        embed_dim: Embedding size.
        hidden: LSTM hidden size per direction.
        n_classes: Number of outputs of the final Linear layer.
        bidirectional: Whether the LSTM runs in both directions.
        num_layers: Number of stacked LSTM layers.
    """
    def __init__(self, vocab_size, embed_dim=128, hidden=256, n_classes=2,
                 bidirectional=True, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout only exists between stacked layers; passing a
        # non-zero value with a single layer just triggers a warning.
        self.rnn = nn.LSTM(embed_dim, hidden, num_layers=num_layers,
                           batch_first=True, bidirectional=bidirectional,
                           dropout=0.3 if num_layers > 1 else 0.0)
        n_dirs = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden * n_dirs, n_classes)

    def forward(self, x, lengths=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer token ids, batch first.
            lengths: Optional per-sequence lengths. When given, the embedded
                batch is packed so the LSTM skips the padded tail.

        Returns:
            Tensor of shape (batch, n_classes).
        """
        e = self.embed(x)
        if lengths is not None:
            # pack_padded_sequence needs the lengths on the CPU.
            packed = nn.utils.rnn.pack_padded_sequence(
                e, torch.as_tensor(lengths).cpu(),
                batch_first=True, enforce_sorted=False)
            _, (h, _) = self.rnn(packed)
        else:
            _, (h, _) = self.rnn(e)
        if self.rnn.bidirectional:
            # The last two entries of h are the top layer's forward and
            # backward final states; concatenate them.
            h = torch.cat([h[-2], h[-1]], dim=1)
        else:
            # Unidirectional: only the top layer's final state. h[-2] would
            # belong to the layer below (or not exist with a single layer).
            h = h[-1]
        return self.fc(h)

Mini Transformer

class MiniTransformer(nn.Module):
    """Small Transformer-encoder classifier with learned positions.

    Token and position embeddings are summed, passed through a stack of
    encoder layers, mean-pooled over the sequence and fed to a Linear head.

    Args:
        vocab_size: Number of rows of the token embedding table.
        embed: Model width (embedding size).
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of encoder layers.
        max_len: Number of learned positions; input sequences must not be
            longer than this.
        n_classes: Number of outputs of the final Linear layer.
    """
    def __init__(self, vocab_size, embed=128, n_heads=4, n_layers=4,
                 max_len=512, n_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed)
        self.pos   = nn.Embedding(max_len, embed)
        enc_layer = nn.TransformerEncoderLayer(embed, n_heads,
                                               dim_feedforward=embed*4,
                                               dropout=0.1,
                                               batch_first=True,
                                               activation='gelu')
        self.encoder = nn.TransformerEncoder(enc_layer, n_layers)
        self.cls = nn.Linear(embed, n_classes)

    def forward(self, x, mask=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer token ids of shape (batch, seq).
            mask: Optional padding mask of shape (batch, seq), passed to the
                encoder as ``src_key_padding_mask``; non-zero / True entries
                mark padding.

        Returns:
            Tensor of shape (batch, n_classes).
        """
        pos_ids = torch.arange(x.size(1), device=x.device)
        h = self.embed(x) + self.pos(pos_ids)  # positions broadcast over batch
        h = self.encoder(h, src_key_padding_mask=mask)
        if mask is None:
            pooled = h.mean(dim=1)
        else:
            # Average over real tokens only; padded positions must not
            # contribute to the pooled representation.
            pad = mask.bool().unsqueeze(-1)
            n_valid = (~pad).sum(dim=1).clamp(min=1)  # guard all-pad rows
            pooled = h.masked_fill(pad, 0.0).sum(dim=1) / n_valid
        return self.cls(pooled)

設定駆動でモデルを作る

YAML/dict で構成を表現し、コード変更なしで実験できる作り。Hydra や OmegaConf と相性◎。

def build_model(config):
    """Build a model from a configuration mapping.

    Args:
        config: Mapping with a required 'type' key selecting the model:

            * 'mlp': requires 'input_dim', 'hidden_dims', 'output_dim';
              optional 'activation' ('relu'), 'dropout' (0.0),
              'batch_norm' (False).
            * 'cnn': requires 'n_classes'; optional 'channels'
              ((64, 128, 256, 512)).
            * 'resnet': requires 'backbone' (name of a constructor in
              ``torchvision.models``) and 'n_classes'; optional 'weights'
              ('IMAGENET1K_V1'; pass None to skip pretrained weights).

    Returns:
        The constructed ``nn.Module``.

    Raises:
        KeyError: If 'type' or a required key for the chosen type is missing.
        ValueError: If 'type' is not one of the supported values.
    """
    kind = config['type']
    if kind == 'mlp':
        return MLP(
            input_dim=config['input_dim'],
            hidden_dims=config['hidden_dims'],
            output_dim=config['output_dim'],
            activation=config.get('activation', 'relu'),
            dropout=config.get('dropout', 0.0),
            batch_norm=config.get('batch_norm', False),
        )
    elif kind == 'cnn':
        return SimpleCNN(
            n_classes=config['n_classes'],
            channels=tuple(config.get('channels', (64, 128, 256, 512))),
        )
    elif kind == 'resnet':
        # Imported lazily so torchvision is only needed for this model type.
        from torchvision import models
        weights = config.get('weights', 'IMAGENET1K_V1')
        m = getattr(models, config['backbone'])(weights=weights)
        # Swap the classification head for one sized for the target task.
        m.fc = nn.Linear(m.fc.in_features, config['n_classes'])
        return m
    raise ValueError(f"unknown type: {kind}")


# Usage example (in practice this mapping would be loaded from YAML)
config = dict(
    type='mlp',
    input_dim=784,
    hidden_dims=[512, 256],
    output_dim=10,
    activation='gelu',
    dropout=0.2,
    batch_norm=True,
)
model = build_model(config)

パラメータ数の確認とサマリ

def model_summary(model, input_size):
    """Print parameter counts for ``model`` followed by a layer summary.

    The first line reports total and trainable parameters in millions. After
    that, ``torchinfo.summary`` is called with ``input_size`` when torchinfo
    can be imported; otherwise the model's own repr is printed.

    Args:
        model: Module whose parameters are counted.
        input_size: Input size forwarded to ``torchinfo.summary``.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    print(f"Parameters: {total/1e6:.2f}M (trainable {trainable/1e6:.2f}M)")

    # Per-layer output sizes: torchinfo is the convenient way to get them.
    try:
        from torchinfo import summary
        summary(model, input_size=input_size)
    except ImportError:
        print(model)