model.py
    import pickle

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    import numpy as np
    from tqdm import tqdm
    from cnn.music_processor import *  # provides milden() and the Audio helpers referenced below
    
    """
    On the paper,
    Starting from a stack of three spectrogram excerpts,
    convolution and max-pooling in turns compute a set of 20 feature maps 
    classified with a fully-connected network.
    """
    
    class convNet(nn.Module):
        """
        copies the neural net used in a paper.
        "Improved musical onset detection with Convolutional Neural Networks".
        src: https://ieeexplore.ieee.org/document/6854953
        """
    
        def __init__(self):
    
            super(convNet, self).__init__()
            # model
            self.conv1 = nn.Conv2d(3, 10, (3, 7))
            self.conv2 = nn.Conv2d(10, 20, 3)
            self.fc1 = nn.Linear(1120, 256)
            self.fc2 = nn.Linear(256, 120)
            self.fc3 = nn.Linear(120, 1)
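
            # Shape check (a sketch, assuming (N, 3, 80, 15) inputs as described
            # in the train_data_builder docstring):
            #   conv1 (3x7):     (N, 3, 80, 15) -> (N, 10, 78, 9)
            #   max_pool (3, 1):                -> (N, 10, 26, 9)
            #   conv2 (3x3):                    -> (N, 20, 24, 7)
            #   max_pool (3, 1):                -> (N, 20, 8, 7)
            #   flatten: 20 * 8 * 7 = 1120, the input size of fc1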
    
    
        def forward(self, x, istraining=False, minibatch=1):
    
            x = F.max_pool2d(F.relu(self.conv1(x)), (3, 1))
            x = F.max_pool2d(F.relu(self.conv2(x)), (3, 1))
            # flatten to (minibatch, 1120); dropout is active only when istraining=True
            x = F.dropout(x.view(minibatch, -1), training=istraining)
            x = F.dropout(F.relu(self.fc1(x)), training=istraining)
            x = F.dropout(F.relu(self.fc2(x)), training=istraining)

            return torch.sigmoid(self.fc3(x))  # onset probability in [0, 1]
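
        # A minimal smoke test (illustrative only; the input size is assumed from
        # the docstrings in this file):
        #   net = convNet()
        #   out = net(torch.randn(4, 3, 80, 15), minibatch=4)
        #   out.shape  # -> torch.Size([4, 1])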
    
    
        def train_data_builder(self, feats, answer, major_note_index, samplerate, soundlen=15, minibatch=1, split=0.2):
            """
            Args:
                feats: song.feats; Audio module
                answers: song.answers; Audio module
                major_note_index: answer labels; corresponding to feats
                samplerate: song.samplerate; Audio module
                soundlen: =15. 学習モデルに渡す画像データの横方向の長さ.ここでは(80 * 15)サイズのデータを使用している
                minibatch: training minibatch
                split: =1. 
            Variables:
                minspace: minimum space between major note indexs
                maxspace: maximum space between major note indexs
                idx: index of major_note_index or feats
                dist: distance of two notes
            """
    
            # acceptable interval in seconds
            minspace = 0.1
            maxspace = 0.7
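
            # the check below multiplies by samplerate / 512 to convert these seconds
            # into feature-frame counts (assuming a hop size of 512 samples in
            # cnn.music_processor)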
    
            idx = np.random.permutation(major_note_index.shape[0] - soundlen) + soundlen // 2
            X, y = [], []
            cnt = 0
            
            for i in range(int(idx.shape[0] * split)):
    
                dist = major_note_index[idx[i] + 1] - major_note_index[idx[i]]  # frames until the next labeled note

                if minspace * samplerate / 512 < dist < maxspace * samplerate / 512:
                    for j in range(-1, dist + 2):
                        X.append(feats[:, :, major_note_index[idx[i]] - soundlen // 2 + j : major_note_index[idx[i]] + soundlen // 2 + j + 1])
                        y.append(answer[major_note_index[idx[i]] + j])
                        cnt += 1
                        
                        if cnt % minibatch == 0:
                            yield (torch.from_numpy(np.array(X)).float(), torch.from_numpy(np.array(y)).float())
                            X, y = [], []
    
    
        def infer_data_builder(self, feats, soundlen=15, minibatch=1):
            
            x = []
            
            for i in range(feats.shape[2] - soundlen):
                x.append(feats[:, :, i:i+soundlen])
            
                if (i + 1) % minibatch == 0:
                    yield (torch.from_numpy(np.array(x)).float())
                    x = []
            
            if len(x) != 0:  # flush the final partial minibatch
                yield (torch.from_numpy(np.array(x)).float())
    
    
        def train(self, songs, minibatch, epoch, device, soundlen=15, val_song=None, save_place='./models/model.pth', log='./log/log.txt'):
            """
            Args:
                songs: the list of song
                minibatch: minibatch value
                epoch: number of train 
                device: cpu / gpu
                soundlen: width of one train data's image
                val_song: validation song, if you wanna validation while training, give a path of validation song data.
                save_place: save place path
                log: log place path
                don-ka: don(1) or ka(2) or both(0), usually, firstly, train don, then, train ka.
            """
    
            for song in songs:

                # build a per-frame 0/1 answer vector from the song's timestamps
                timing = np.array([syl[0] for syl in song.timestamp])
                syllable = np.array([syl[1] for syl in song.timestamp])
                song.answer = np.zeros((song.feats.shape[2]))

                # keep only labeled notes and convert their times to frame indices
                song.major_note_index = np.rint(timing[np.where(syllable != "")] * song.samplerate / 512).astype(np.int32)

                # drop indices that fall past the end of the feature matrix
                song.major_note_index = np.delete(song.major_note_index, np.where(song.major_note_index >= song.feats.shape[2]))

                song.answer[song.major_note_index] = 1

                # milden() comes from music_processor; it is assumed to soften the
                # one-hot labels around each onset
                song.answer = milden(song.answer)
    
            # training: MSE between the predicted onset probabilities and the
            # softened labels
            optimizer = optim.SGD(self.parameters(), lr=0.02)
            criterion = nn.MSELoss()
            running_loss = 0
            val_loss = 0
    
            for i in range(epoch):
                for song in songs:
                    for X, y in self.train_data_builder(song.feats, song.answer, song.major_note_index, song.samplerate, soundlen, minibatch, split=0.2):
                        optimizer.zero_grad()
                        output = self(X.to(device), istraining=True, minibatch=minibatch)
                        target = y.to(device)
                        loss = criterion(output.squeeze(), target)
                        loss.backward()
                        optimizer.step()
                        running_loss += loss.item()
    
                with open(log, 'a') as f:
                    print("epoch: %d running_loss: %.10f" % (i + 1, running_loss), file=f)

                print("epoch: %d running_loss: %.10f" % (i + 1, running_loss))
                
                running_loss = 0    
    
                if val_song:
                    inference = torch.from_numpy(self.infer(val_song.feats, device, minibatch=512)).to(device)
                    target = torch.from_numpy(val_song.answer[:-soundlen]).float().to(device)
                    loss = criterion(inference.squeeze(), target)
                    val_loss = loss.item()
    
                    with open(log, 'a') as f:
                        print("val_loss: %.10f " % (val_loss), file=f)
    
            torch.save(self.state_dict(), save_place)
    
    
        def infer(self, feats, device, minibatch=1):
            """Return per-frame onset probabilities for feats as a 1-D numpy array."""

            with torch.no_grad():
                inference = None
                for x in tqdm(self.infer_data_builder(feats, minibatch=minibatch), total=feats.shape[2]//minibatch):
                    output = self(x.to(device), minibatch=x.shape[0])
                    if inference is not None:
                        inference = np.concatenate((inference, output.cpu().numpy().reshape(-1)))
                    else:
                        inference = output.cpu().numpy().reshape(-1)
                
                return np.array(inference).reshape(-1)
    
    
    if __name__ == '__main__':
    
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        net = convNet()
        net = net.to(device)
               
        with open('./data/pickles/train_data.pickle', mode='rb') as f:
            songs = pickle.load(f)
    
        minibatch = 128
        soundlen = 15
        epoch = 100
    
        
        net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
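
    # Inference sketch (hypothetical usage; assumes a song object with the same
    # .feats layout as the training data):
    #   net.load_state_dict(torch.load('./models/model.pth'))
    #   onset_probs = net.infer(song.feats, device, minibatch=512)
    #   # onset_probs[i] is the predicted onset probability at frame i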