From f56ebd14bde28a5f339e9b7d5ca0f7ee83c2ac97 Mon Sep 17 00:00:00 2001
From: Sting <lallegre26@gmail.com>
Date: Thu, 20 Jul 2023 23:08:28 +0200
Subject: [PATCH] Test stuff, use LyrAlign on a syllable basis
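
Wire the LyricsAlignment word aligner into the segmentation pipeline:

- move assUtils.py into the autosyl package and make all imports absolute
- let align() take a checkpoint_folder and return centisecond timestamps
  instead of raw frame indices
- let preprocess_lyrics() take in-memory lyric lines instead of a file path
- skip empty syllables in getSyls()
- in segment.py, run the aligner on the reference syllables and use its
  word onsets; the per-word onset filtering is still work in progress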

---
 autokara.py                        |   2 +-
 autosyl/LyricsAlignment/model.py   |   2 +-
 autosyl/LyricsAlignment/wrapper.py |  20 +++--
 assUtils.py => autosyl/assUtils.py |   3 +-
 autosyl/segment.py                 | 123 ++++++++++++++++++++++++++---
 5 files changed, 128 insertions(+), 22 deletions(-)
 rename assUtils.py => autosyl/assUtils.py (98%)

diff --git a/autokara.py b/autokara.py
index 9cbf49d..f4d3ba6 100644
--- a/autokara.py
+++ b/autokara.py
@@ -4,8 +4,8 @@ import demucs.separate
 import subprocess
 import shlex
 from pathlib import Path
-from assUtils import AssWriter, getSyls
 
+from autosyl.assUtils import AssWriter, getSyls
 from autosyl.segment import segment
 
 
diff --git a/autosyl/LyricsAlignment/model.py b/autosyl/LyricsAlignment/model.py
index 33669d6..f6fd66b 100644
--- a/autosyl/LyricsAlignment/model.py
+++ b/autosyl/LyricsAlignment/model.py
@@ -4,7 +4,7 @@ import torch.nn.functional as F
 import torchaudio
 import warnings
 
-from utils import notes_to_pc
+from autosyl.LyricsAlignment.utils import notes_to_pc
 
 # following FFT parameters are designed for a 22.5k sampling rate
 sr = 22050
diff --git a/autosyl/LyricsAlignment/wrapper.py b/autosyl/LyricsAlignment/wrapper.py
index 1727822..120fcfe 100644
--- a/autosyl/LyricsAlignment/wrapper.py
+++ b/autosyl/LyricsAlignment/wrapper.py
@@ -5,8 +5,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-import utils
-from model import train_audio_transforms, AcousticModel, BoundaryDetection
+import autosyl.LyricsAlignment.utils as utils
+from autosyl.LyricsAlignment.model import train_audio_transforms, AcousticModel, BoundaryDetection
 
 np.random.seed(7)
 
@@ -17,7 +17,7 @@ def preprocess_from_file(audio_file, lyrics_file, word_file=None):
 
     return y, words, lyrics_p, idx_word_p, idx_line_p
 
-def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cuda=True):
+def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cuda=True, checkpoint_folder="."):
 
     # start timer
     t = time()
@@ -61,7 +61,7 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
     ).to(device)
 
     print("Loading acoustic model from checkpoint...")
-    state = utils.load_model(ac_model, "./checkpoints/checkpoint_{}".format(model_type), cuda=(device=="gpu"))
+    state = utils.load_model(ac_model, "{}/checkpoint_{}".format(checkpoint_folder, model_type), cuda=(device=="gpu"))
     ac_model.eval()
 
     print("Computing phoneme posteriorgram...")
@@ -106,7 +106,7 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
             bdr_hparams['n_feats'], bdr_hparams['stride'], bdr_hparams['dropout']
         ).to(device)
         print("Loading BDR model from checkpoint...")
-        state = utils.load_model(bdr_model, "./checkpoints/checkpoint_BDR", cuda=(device == "gpu"))
+        state = utils.load_model(bdr_model, "{}/checkpoint_BDR".format(checkpoint_folder), cuda=(device == "gpu"))
         bdr_model.eval()
 
         print("Computing boundary probability curve...")
@@ -128,6 +128,9 @@ def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cud
     t = time() - t
     print("Alignment Score:\t{}\tTime:\t{}".format(score, t))
 
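+    # Convert frame indices to centisecond timestamps: one frame is
+    # (apparently) 256 samples at 22.05 kHz with a 3x model stride, and the
+    # extra factor of 100 turns seconds into centiseconds (hence 25600)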
+    resolution = 25600 / 22050 * 3
+    word_align = [[round(word[0] * resolution), round(word[1] * resolution)] for word in word_align]
+
     return word_align, words
 
 def preprocess_audio(audio_file, sr=22050):
@@ -140,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050):
 
     return y, curr_sr
 
-def preprocess_lyrics(lyrics_file, word_file=None):
+def preprocess_lyrics(lyrics_lines, word_file=None):
     from string import ascii_lowercase
     d = {ascii_lowercase[i]: i for i in range(26)}
     d["'"] = 26
@@ -148,8 +151,9 @@ def preprocess_lyrics(lyrics_file, word_file=None):
     d["~"] = 28
 
     # process raw
-    with open(lyrics_file, 'r') as f:
-        raw_lines = f.read().splitlines()
+    #with open(lyrics_file, 'r') as f:
+    #    raw_lines = f.read().splitlines()
+    raw_lines = lyrics_lines
 
     raw_lines = ["".join([c for c in line.lower() if c in d.keys()]).strip() for line in raw_lines]
     raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0]
diff --git a/assUtils.py b/autosyl/assUtils.py
similarity index 98%
rename from assUtils.py
rename to autosyl/assUtils.py
index 7b5fd28..a9c4444 100644
--- a/assUtils.py
+++ b/autosyl/assUtils.py
@@ -33,7 +33,8 @@ def getSyls(ass_file):
             syl_line = []
             lastTime = dateToTime(line[0])
             for couple in RGX_TAGS.findall(line[2]):
-                syl_line.append([lastTime, couple[1], int(couple[0])])
+                if couple[1] != '':
+                    syl_line.append([lastTime, couple[1], int(couple[0])])
                 lastTime += int(couple[0])
             syl_line.append([lastTime, '', 0])
             SYLS.append(syl_line)
diff --git a/autosyl/segment.py b/autosyl/segment.py
index 5f4d785..8b63283 100644
--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -6,12 +6,47 @@ import matplotlib.pyplot as plt
 import scipy.signal as sg
 import parselmouth
 
+from autosyl.assUtils import getSyls, timeToDate, dateToTime
+from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
+
 
 def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
 
     delay = -4
     backtrack = False
 
+
+    audio_file = songfile                      # pre-computed source-separated vocals; these models do not work with mixture input
+    word_file = None                           # example: jamendolyrics/lyrics/*.words.txt; set to None if you don't have it
+    method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+    cuda = False                               # set to True if you have access to a GPU
+    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+    pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word" (currently unused)
+
+
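+    # reference_syls is required from here on: join each line's syllables
+    # into space-separated words for the aligner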
+    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
+    #print(lyrics_lines)
+
+
+    # load audio and lyrics
+    # words:        a list of words
+    # lyrics_p:     phoneme sequence of the target lyrics
+    # idx_word_p:   indices of word start in lyrics_p
+    # idx_line_p:   indices of line start in lyrics_p
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+    # compute alignment
+    # word_align:   a list of frame indices aligned to each word
+    # words:        a list of words
+    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)
+
+
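+    # Onset time of each aligned word, in centiseconds (converted in align())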
+    words_onsets = np.array([word[0] for word in word_align])
+
     cnn = madmom.features.onsets.CNNOnsetProcessor()
     spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
 
@@ -91,6 +126,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
             while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
                 onsets[i] -= 1
 
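+    # Override the madmom-detected onsets with the aligner's word onsets
+    # (centiseconds, matching the 100 fps frame indices expected below)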
+    onsets = words_onsets
     onsets = (onsets + delay)/100
     #print(onsets)
 
@@ -159,10 +195,42 @@ if __name__ == "__main__":
     else:
         reference_syls = None
     
-    #print(reference_syls)
+    print(reference_syls)
 
     backtrack = False
 
+
+    audio_file = songfile                      # pre-computed source-separated vocals; these models do not work with mixture input
+    word_file = None                           # example: jamendolyrics/lyrics/*.words.txt; set to None if you don't have it
+    method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
+    cuda = False                               # set to True if you have access to a GPU
+    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+
+    pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word" (currently unused)
+
+
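+    # Join each line's syllables into space-separated words for the aligner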
+    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
+    #print(lyrics_lines)
+
+
+    # load audio and lyrics
+    # words:        a list of words
+    # lyrics_p:     phoneme sequence of the target lyrics
+    # idx_word_p:   indices of word start in lyrics_p
+    # idx_line_p:   indices of line start in lyrics_p
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+
+    # compute alignment
+    # word_align:   a list of frame indices aligned to each word
+    # words:        a list of words
+    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)
+
+
+    print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))])
+
+
     cnn = madmom.features.onsets.CNNOnsetProcessor()
     spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
 
@@ -218,18 +286,51 @@ if __name__ == "__main__":
 
     if reference_syls:
         filtered_onsets = []
-        line_index = 0
+        word_index = 0
+        previous_onset = 0
         for line in reference_syls:
-            line_index += 1
-            syl_number = len(line) - 1
-            line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])]
-            line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
-            if syl_number > len(line_onsets):
-                print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0]))
-            filtered_onsets += line_onsets[0:syl_number]
-        
+            word_line = []
+            word_tmp = ""
+            word_syl_count = 0
+            for syl_index in range(len(line) - 1):
+                word_tmp += line[syl_index][1].strip()
+                word_syl_count += 1
+                #print("%s : %s" % (word_tmp, words[word_index]))
+                if word_tmp == words[word_index]:
+                    print("Word %d recognized: %s" % (word_index, word_tmp))
+                    word_line.append([word_align[word_index][0], word_align[word_index][1], words[word_index], word_syl_count, 0])
+                    word_index += 1
+                    word_tmp = ""
+                    word_syl_count = 0 
+
+                # TODO(WIP): per-word onset refinement over the word_line
+                # windows, stubbed out until the stashed filtering logic
+                # below is reintegrated
+                #num_words_done = 0
+                #while num_words_done < len(word_line):
+                #    for word in word_line:
+                #        window_start = word[0]
+
+            print(word_line)
+
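+        # NOTE: filtered_onsets is never populated while the filtering logic
+        # is stashed below, so this currently produces an empty array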
         onsets = np.array(sorted(filtered_onsets))
-    
+
+    """ 
+        if word_index > 0:
+            word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1)
+        else:
+            word_start = line[0][0]
+        if word_index < len(words) - 1 and syl_index < len(line) - 2:
+            word_end = min(line[-1][0], word_align[word_index + 1][0] - 5)
+        else:
+            word_end = line[-1][0]
+
+        word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)]
+        word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x]))
+        if word_syl_count > len(word_onsets):
+            print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end))
+        filtered_onsets += word_onsets[0:word_syl_count]
+        print(word_onsets[0:word_syl_count])
+        previous_onset = max(word_onsets[0:word_syl_count] + [0])
+    """
+
     # Backtrack onsets to closest earlier local minimum
     if backtrack:
         backtrack_max_frames = 50
-- 
GitLab