
CNN segmentation

Merged · Sting requested to merge cnn-segmentation into master
13 files +608 −50
cnn/model.py 0 → 100644 +193 −0
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

from cnn.music_processor import *  # provides milden() among other helpers
"""
On the paper,
Starting from a stack of three spectrogram excerpts,
convolution and max-pooling in turns compute a set of 20 feature maps
classified with a fully-connected network.
"""
class convNet(nn.Module):
    """
    Reimplements the network of the paper
    "Improved musical onset detection with Convolutional Neural Networks".
    src: https://ieeexplore.ieee.org/document/6854953
    """
    def __init__(self):
        super(convNet, self).__init__()
        # model
        self.conv1 = nn.Conv2d(3, 10, (3, 7))
        self.conv2 = nn.Conv2d(10, 20, 3)
        self.fc1 = nn.Linear(1120, 256)
        self.fc2 = nn.Linear(256, 120)
        self.fc3 = nn.Linear(120, 1)
    def forward(self, x, istraining=False, minibatch=1):
        # (N, 3, 80, 15) -> conv1 (N, 10, 78, 9) -> pool (N, 10, 26, 9)
        x = F.max_pool2d(F.relu(self.conv1(x)), (3, 1))
        # conv2 (N, 20, 24, 7) -> pool (N, 20, 8, 7)
        x = F.max_pool2d(F.relu(self.conv2(x)), (3, 1))
        # flatten to (N, 1120) = 20 * 8 * 7
        x = F.dropout(x.view(minibatch, -1), training=istraining)
        x = F.dropout(F.relu(self.fc1(x)), training=istraining)
        x = F.dropout(F.relu(self.fc2(x)), training=istraining)
        return torch.sigmoid(self.fc3(x))  # F.sigmoid is deprecated
    def train_data_builder(self, feats, answer, major_note_index, samplerate, soundlen=15, minibatch=1, split=0.2):
        """
        Args:
            feats: song.feats; Audio module
            answer: song.answer; Audio module
            major_note_index: indices of the labeled notes, corresponding to feats
            samplerate: song.samplerate; Audio module
            soundlen: horizontal length (in frames) of the image data passed to
                the model; here, data of size (80 * 15) is used
            minibatch: training minibatch size
            split: fraction (default 0.2) of the candidate note indices used to build batches
        Variables:
            minspace: minimum space between major note indices
            maxspace: maximum space between major note indices
            idx: randomly permuted indices into major_note_index
            dist: distance between two notes
        """
        # acceptable interval in seconds
        minspace = 0.1
        maxspace = 0.7
        idx = np.random.permutation(major_note_index.shape[0] - soundlen) + soundlen // 2
        X, y = [], []
        cnt = 0
        for i in range(int(idx.shape[0] * split)):
            dist = major_note_index[idx[i] + 1] - major_note_index[idx[i]]  # distance (in frames) to the next note
            if dist < maxspace * samplerate / 512 and dist > minspace * samplerate / 512:
                for j in range(-1, dist + 2):
                    X.append(feats[:, :, major_note_index[idx[i]] - soundlen // 2 + j : major_note_index[idx[i]] + soundlen // 2 + j + 1])
                    y.append(answer[major_note_index[idx[i]] + j])
                    cnt += 1
                    if cnt % minibatch == 0:
                        yield (torch.from_numpy(np.array(X)).float(), torch.from_numpy(np.array(y)).float())
                        X, y = [], []
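
    # Worked example (assuming 44.1 kHz audio and the hop size of 512 used
    # throughout): minspace corresponds to 0.1 * 44100 / 512 ~= 8.6 frames and
    # maxspace to 0.7 * 44100 / 512 ~= 60.3 frames, so only notes whose
    # successor falls roughly 9..60 frames away contribute training windows.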
    def infer_data_builder(self, feats, soundlen=15, minibatch=1):
        x = []
        for i in range(feats.shape[2] - soundlen):
            x.append(feats[:, :, i:i+soundlen])
            if (i + 1) % minibatch == 0:
                yield (torch.from_numpy(np.array(x)).float())
                x = []
        if len(x) != 0:
            yield (torch.from_numpy(np.array(x)).float())
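
    # Note: the sliding window above yields feats.shape[2] - soundlen excerpts,
    # which is why train() compares the inference against answer[:-soundlen].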
    def train(self, songs, minibatch, epoch, device, soundlen=15, val_song=None, save_place='./models/model.pth', log='./log/log.txt'):
        """
        Note: this overrides nn.Module.train(); call it with keyword arguments.
        Args:
            songs: the list of songs
            minibatch: minibatch size
            epoch: number of training epochs
            device: cpu / gpu
            soundlen: width of one training datum's image
            val_song: validation song; to validate while training, pass the
                validation song data here.
            save_place: path to save the model to
            log: path of the log file
        There is no don/ka selection argument in this version; usually one
        model is trained for don (1) first, then a separate one for ka (2).
        """
        for song in songs:
            timing = np.array([syl[0] for syl in song.timestamp])
            syllable = np.array([syl[1] for syl in song.timestamp])
            song.answer = np.zeros((song.feats.shape[2]))
            # map note timestamps (seconds) to spectrogram frame indices
            song.major_note_index = np.rint(timing[np.where(syllable != "")] * song.samplerate / 512).astype(np.int32)
            # drop notes that fall past the end of the spectrogram
            song.major_note_index = np.delete(song.major_note_index, np.where(song.major_note_index >= song.feats.shape[2]))
            song.answer[song.major_note_index] = 1
            song.answer = milden(song.answer)  # smooth the hard 0/1 targets (milden comes from music_processor)
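            # e.g. (hypothetical numbers) a note at t = 3.0 s in a song sampled
            # at 44100 Hz maps to frame rint(3.0 * 44100 / 512) = 258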
        # training
        optimizer = optim.SGD(self.parameters(), lr=0.02)
        criterion = nn.MSELoss()
        running_loss = 0
        val_loss = 0
        for i in range(epoch):
            for song in songs:
                for X, y in self.train_data_builder(song.feats, song.answer, song.major_note_index, song.samplerate, soundlen, minibatch, split=0.2):
                    optimizer.zero_grad()
                    output = self(X.to(device), istraining=True, minibatch=minibatch)
                    target = y.to(device)
                    loss = criterion(output.squeeze(), target)
                    loss.backward()
                    optimizer.step()
                    running_loss += loss.item()
            with open(log, 'a') as f:
                print("epoch: %d running_loss: %.10f" % (i + 1, running_loss), file=f)
            print("epoch: %d running_loss: %.10f" % (i + 1, running_loss))
            running_loss = 0
            if val_song:
                inference = torch.from_numpy(self.infer(val_song.feats, device, minibatch=512)).to(device)
                target = torch.from_numpy(val_song.answer[:-soundlen]).float().to(device)
                loss = criterion(inference.squeeze(), target)
                val_loss = loss.item()
                with open(log, 'a') as f:
                    print("val_loss: %.10f" % val_loss, file=f)
            torch.save(self.state_dict(), save_place)
    def infer(self, feats, device, minibatch=1):
        with torch.no_grad():
            inference = None
            for x in tqdm(self.infer_data_builder(feats, minibatch=minibatch), total=feats.shape[2] // minibatch):
                output = self(x.to(device), minibatch=x.shape[0])
                if inference is not None:
                    inference = np.concatenate((inference, output.cpu().numpy().reshape(-1)))
                else:
                    inference = output.cpu().numpy().reshape(-1)
            return np.array(inference).reshape(-1)
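
# Hypothetical post-processing sketch (the threshold and minibatch values are
# assumptions, not part of this change):
#   scores = net.infer(song.feats, device, minibatch=512)
#   onset_frames = np.where(scores > 0.85)[0]
#   onset_times = onset_frames * 512 / song.samplerate  # frames -> seconds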
if __name__ == '__main__':
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = convNet()
    net = net.to(device)
    with open('./data/pickles/train_data.pickle', mode='rb') as f:
        songs = pickle.load(f)
    minibatch = 128
    soundlen = 15
    epoch = 100
    net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file